diff options
Diffstat (limited to 'peekaboo/ruleset/expressions.py')
-rw-r--r-- | peekaboo/ruleset/expressions.py | 515 |
1 files changed, 515 insertions, 0 deletions
###############################################################################
#                                                                             #
# Peekaboo Extended Email Attachment Behavior Observation Owl                 #
#                                                                             #
# ruleset/                                                                    #
# expressions.py                                                              #
###############################################################################
#                                                                             #
# Copyright (C) 2016-2019 science + computing ag                              #
# Based on pyparsing's eval_arith.py.
# Copyright 2009, 2011 Paul McGuire
#                                                                             #
# This program is free software: you can redistribute it and/or modify        #
# it under the terms of the GNU General Public License as published by        #
# the Free Software Foundation, either version 3 of the License, or (at       #
# your option) any later version.                                             #
#                                                                             #
# This program is distributed in the hope that it will be useful, but         #
# WITHOUT ANY WARRANTY; without even the implied warranty of                  #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU            #
# General Public License for more details.                                    #
#                                                                             #
# You should have received a copy of the GNU General Public License           #
# along with this program. If not, see <http://www.gnu.org/licenses/>.        #
#                                                                             #
###############################################################################

""" A simple expression grammar used for writing generic rules. """

from future.builtins import super

import logging
import operator
import re
from pyparsing import nums, alphas, alphanums, Word, Combine, Suppress, \
        oneOf, opAssoc, infixNotation, Literal, Keyword, Group, \
        delimitedList, QuotedString, ParserElement, ParseException
from peekaboo.ruleset import Result


logger = logging.getLogger(__name__)

# native and unicode string types of the running interpreter (identical on
# Python 3), used to tell atomic string tokens from grouped token lists
_STRING_TYPES = (type(""), type(u""))


class EvalBase(object):
    """ Base class of evaluatable objects providing common infrastructure. """
    def __init__(self, tokens):
        """ Just store the tokens for later evaluation. Expects all relevant
        tokens to be grouped together in the first element of the token list
        passed. This is the default for operand+operator+operand+... constructs
        with infixNotation and can be forced for others using Group():

        rvallist = Group(Suppress('[') + delimitedList(rval) + Suppress(']'))
        """
        self.value = self.token = tokens[0]
        self.context = None
        self.convert()
        self.string_repr_format = "(%s)"

    def convert(self):
        """ Method to (optionally) convert the input token(s) into something
        else. Particularly used for conversion to base types. """
        self.value = self.token

    def feedback(self, info):
        """ Accept and process feedback from evaluation children. """
        # by default propagate feedback upwards if we have a context
        if self.context and 'parent' in self.context:
            self.context['parent'].feedback(info)

    def subeval(self, expression, update=None):
        """ Evaluate a subexpression with an updated evaluation context
        containing common metadata such as that we're its parent and optional
        additional data.

        Requires set_context() to have been called before, which all eval()
        implementations in this module do as their first step. """
        context = self.context.copy()
        context['parent'] = self
        if update:
            context.update(update)
        return expression.eval(context)

    def set_context(self, context):
        """ Save an evaluation context internally for later use by e.g.
        feedback(). """
        self.context = context

    def eval(self, context):
        """ Evaluate the object content against a context. Just return the
        stored (and optionally converted) value by default and remember our
        context for possible feedback to our parent or from our children. """
        self.set_context(context)
        return self.value

    def __str__(self):
        """ Render the original token(s) using the format of this class. """
        if isinstance(self.token, _STRING_TYPES):
            # an atomic token: joining over it would iterate its characters
            # and yield e.g. "(f o o)" instead of "(foo)"
            content = self.token
        else:
            content = " ".join(["%s" % x for x in self.token])
        return self.string_repr_format % content


class EvalBoolean(EvalBase):
    """ Class to evaluate a parsed boolean constant """
    def convert(self):
        logger.debug("Boolean: %s", self.token)
        self.value = self.token == "True"


class EvalInteger(EvalBase):
    """ Class to evaluate a parsed integer constant """
    def convert(self):
        logger.debug("Integer: %s", self.token)
        self.value = int(self.token)


class EvalReal(EvalBase):
    """ Class to evaluate a parsed real constant """
    def convert(self):
        logger.debug("Real: %s", self.token)
        self.value = float(self.token)


class EvalString(EvalBase):
    """ Class to evaluate a parsed string constant """
    def convert(self):
        logger.debug("String: %s", self.token)
        self.value = self.token


class OperatorRegex(object):
    """ A class implementing operators on regular expressions. """
    def __init__(self, string):
        self.regex = re.compile(string)

    # Overriding __eq__ implicitly sets __hash__ to None on Python 3 which
    # would make regexes unusable as set members (see EvalSet). Hash by
    # identity like Python 2 does by default.
    __hash__ = object.__hash__

    @staticmethod
    def compare_op_impl(function, other):
        """ Implement handling of iterable operands: apply function to every
        element of a list or set and report whether any of them matched,
        otherwise return the result of applying it to the operand itself. """
        if isinstance(other, (list, set)):
            for val in other:
                logger.debug("Regular expression match: %s == %s",
                             function, val)
                if function(val):
                    return True
            return False

        return function(other)

    def __eq__(self, other):
        """ Implement equality using re.match """
        logger.debug("Regular expression match: %s == %s", self.regex, other)
        return self.compare_op_impl(self.regex.match, other)

    def __ne__(self, other):
        """ Implement inequality as negated equality. Python 2 does not derive
        this from __eq__ automatically. """
        return not self.__eq__(other)

    def __contains__(self, other):
        """ Implement membership using re.search """
        logger.debug("Regular expression search: %s in %s", self.regex, other)
        return self.compare_op_impl(self.regex.search, other)


class EvalRegex(EvalBase):
    """ Class to evaluate a regular expression """
    def convert(self):
        logger.debug("Regular expression: %s", self.token)
        self.value = OperatorRegex(self.token)

    def eval(self, context):
        """ Return the regex operator object and tell our parents that a
        regular expression was encountered. """
        self.set_context(context)
        self.feedback({'regex_parsed': True})
        return self.value


class RegexIterableMixIn(object):
    """ Common functionality for lists and sets containing regular expressions
    with different behaviour of membership operators. """
    def __eq__(self, other):
        if not isinstance(other, (list, set)):
            other = [other]

        # in contrast to normal lists, a list of regexes compared to a list
        # of strings is considered equal if any regex matches any string
        for regex in self:
            logger.debug("Eval regex: %s == %s", regex, other)
            if regex == other:
                return True

        return False

    def __ne__(self, other):
        """ Implement inequality as negated equality. Without this, the
        element-wise __ne__ inherited from list or set would be used. """
        return not self.__eq__(other)

    def __contains__(self, item):
        for regex in self:
            logger.debug("Eval regex: %s in %s", regex, item)
            # we implement "regex in string" of our grammar as "string in
            # regex" so that our overridden operator
            # regex.__contains__(string) is called and searching can be
            # done. Otherwise error "TypeError: 'in <string>' requires
            # string as left operand, not OperatorRegex" would ensue.
            if item in regex:
                return True

        return False


class RegexList(RegexIterableMixIn, list):
    """ A list containing regular expressions with different behaviour of
    membership operators. """


class RegexSet(RegexIterableMixIn, set):
    """ A set containing regular expressions with different behaviour of
    membership operators. """


class EvalRegexIterableMixIn(object):
    """ Common functionality for iterables which may contain regular
    expressions. """
    def __init__(self, tokens):
        super().__init__(tokens)
        self.contains_regexes = False

    def feedback(self, info):
        """ Mark this object as containing regular expressions if a child
        object reports so in its feedback to us. """
        if 'regex_parsed' in info:
            self.contains_regexes = True
            # consumed here, do not pass it further up
            del info['regex_parsed']

        super().feedback(info)


class EvalList(EvalRegexIterableMixIn, EvalBase):
    """ Class to evaluate a parsed list """
    def __init__(self, token):
        super().__init__(token)
        self.string_repr_format = "[%s]"

    def eval(self, context):
        """ Evaluate all elements and return them as a list, or as a RegexList
        if any element reported being a regular expression. """
        self.set_context(context)
        logger.debug("List: %s", self.value)
        # elements report regexes via feedback() while being evaluated, so
        # contains_regexes must only be checked afterwards
        ret = [self.subeval(val) for val in self.value]
        if self.contains_regexes:
            return RegexList(ret)
        return ret


class EvalSet(EvalRegexIterableMixIn, EvalBase):
    """ Class to evaluate a parsed set """
    def __init__(self, token):
        super().__init__(token)
        self.string_repr_format = "{%s}"

    def eval(self, context):
        """ Evaluate all elements and return them as a set, or as a RegexSet
        if any element reported being a regular expression. """
        self.set_context(context)
        logger.debug("Set: %s", self.value)
        # elements report regexes via feedback() while being evaluated, so
        # contains_regexes must only be checked afterwards
        ret = set(self.subeval(val) for val in self.value)
        if self.contains_regexes:
            return RegexSet(ret)
        return ret


class IdentifierMissingException(KeyError):
    """ Raised if an identifier cannot be resolved from the variables of the
    evaluation context. """


class EvalIdentifier(EvalBase):
    """ Class to evaluate a parsed object name """
    def eval(self, context):
        """ Resolve the identifier from context['variables'] or return its
        bare name if it is evaluated as a member name of a dereference.

        Raises IdentifierMissingException if the lookup fails. """
        self.set_context(context)
        logger.debug("Identifier: %s", self.value)
        if context.get('member'):
            return self.value

        try:
            return context['variables'][self.value]
        except KeyError as error:
            raise IdentifierMissingException(error.args[0])


class EvalResult(EvalBase):
    """ Class to evaluate an analysis result """
    def convert(self):
        logger.debug("Result: %s", self.token)
        # grammar keywords whose names differ from the Result member names
        result_map = {
            'fail': Result.failed,
            'ignore': Result.ignored,
        }

        if self.token in result_map:
            self.value = result_map[self.token]
        else:
            self.value = Result[self.token]


class EvalModifier(EvalBase):
    """ Class to evaluate typical single-operand modifier expressions such as
    explicit sign change, bitwise and logical not. """
    def __init__(self, tokens):
        super().__init__(tokens)
        self.operator, self.value = tokens[0]

    def eval(self, context):
        """ Evaluate the operand and apply the modifier to it.

        Raises ValueError on an unknown operator. """
        self.set_context(context)
        val = self.subeval(self.value)
        if self.operator == '+':
            return val
        if self.operator == '-':
            return -val
        if self.operator == '~':
            return ~val
        if self.operator == 'not':
            return not val

        raise ValueError('Invalid operator %s' % self.operator)


class EvalPower(EvalBase):
    """ Class to evaluate exponentiation expressions """
    def eval(self, context):
        """ Evaluate the flat operand/operator list from right to left so
        that a ** b ** c yields a ** (b ** c). """
        self.set_context(context)
        res = self.subeval(self.value[-1])
        # step over the operators and walk the remaining operands backwards
        for val in self.value[-3::-2]:
            res = self.subeval(val)**res

        return res


def operator_operands(tokenlist):
    """ Generator to extract operators and operands in pairs. An incomplete
    trailing pair is dropped. """
    iterator = iter(tokenlist)
    while True:
        try:
            yield (next(iterator), next(iterator))
        except StopIteration:
            break


class EvalArith(EvalBase):
    """ Class to evaluate typical arithmetic and bitwise operations like
    addition, multiplication, division and shifts expressions. Operator
    precedence is handled by the order in which they're evaluated by the
    parser, i.e. given to infixNotation. """
    # Plain binary operators. These deliberately are the non-augmented
    # variants so that mutable operands (e.g. a list taken from the context
    # variables) are never modified in place by an evaluation.
    binary_operators = {
        '+': operator.add,
        '-': operator.sub,
        '*': operator.mul,
        # spelled out so it follows the division semantics of this module
        '/': lambda a, b: a / b,
        '//': operator.floordiv,
        '%': operator.mod,
        '<<': operator.lshift,
        '>>': operator.rshift,
        '&': operator.and_,
        '^': operator.xor,
        '|': operator.or_,
    }

    def eval(self, context):
        """ Evaluate the operand/operator chain from left to right.

        Besides the plain binary operators this handles '.' (attribute
        access on the left value) and '->' (evaluate the right operand only
        if the left value is truthy, otherwise yield None).

        Raises ValueError on an unknown operator. """
        self.set_context(context)
        ret = self.subeval(self.value[0])
        for op, val in operator_operands(self.value[1:]):
            if op == '.':
                # evaluate the right side as a bare member name
                ret = getattr(ret, self.subeval(val, update={'member': True}))
            elif op == "->":
                if ret:
                    ret = self.subeval(val)
                else:
                    ret = None
            elif op in self.binary_operators:
                ret = self.binary_operators[op](ret, self.subeval(val))
            else:
                raise ValueError('Invalid operator %s' % op)

        return ret


class EvalLogic(EvalBase):
    """ Class to evaluate comparison and boolean expressions """
    def __init__(self, tokens):
        super().__init__(tokens)
        self.operator_map = {
            "<": operator.lt,
            "<=": operator.le,
            ">": operator.gt,
            ">=": operator.ge,
            "==": operator.eq,
            "!=": operator.ne,
            "in": EvalLogic.in_,
            "not in": EvalLogic.not_in,
            "is": operator.is_,
            "is not": operator.is_not,
            "isdisjoint": lambda a, b: a.isdisjoint(b),
            "and": EvalLogic.and_,
            "or": EvalLogic.or_,
        }

    @staticmethod
    def in_(a, b):
        """ Literally implement membership test. Make it a static method so we
        can do identity checks. Do not use operator.contains because it needs
        operands swapped. """
        return a in b

    @staticmethod
    def not_in(a, b):
        """ Naively implement non-membership test. """
        return a not in b

    @staticmethod
    def and_(a, b):
        """ Logical (not bitwise) conjunction based on truthiness. """
        return bool(a and b)

    @staticmethod
    def or_(a, b):
        """ Logical (not bitwise) disjunction based on truthiness. """
        return bool(a or b)

    @staticmethod
    def handle_regexes(function, val1, val2):
        """ Special handling of equality and membership checks for regular
        expressions. """
        if (function in (operator.eq, operator.ne)
                and isinstance(val2, (OperatorRegex, RegexIterableMixIn))):
            # swap operands around in case the first does not contain any regex
            # but the other does to reliably reroute to our overridden __eq__
            # operator, just do that always to keep checks simple since
            # (in)equality is commutative anyway
            val1, val2 = val2, val1
        elif (function in (EvalLogic.in_, EvalLogic.not_in)
              and isinstance(val1, (OperatorRegex, RegexIterableMixIn))):
            # "<regex> in <string>|<list-of-strings>" of our grammar directly
            # implemented using the "in" operator would call
            # <string>|<list-of-strings>.__contains__(<regex>) which we cannot
            # override with reasonable effort. To get a call of
            # <regex>.__contains__(<string>|<list-of-strings>) we need to
            # switch operands. Otherwise error "TypeError: 'in <string>'
            # requires string as left operand, not OperatorRegex" would ensue.
            val1, val2 = val2, val1

        # nothing special
        return function(val1, val2)

    def _eval_boolean_chain(self, op):
        """ Evaluate a chain of operands joined by 'and' or 'or' based on
        their truthiness. Stops at the first operand which decides the
        result and does not evaluate the remaining ones. """
        # a truthy operand decides 'or', a falsy one decides 'and'
        decisive = (op == 'or')
        # operands are at the even positions, operators in between
        for parseobj in self.value[::2]:
            val = self.subeval(parseobj)
            logger.debug("Boolean %s operand: %s", op, val)
            if bool(val) == decisive:
                return decisive

        return not decisive

    def eval(self, context):
        """ Evaluate the expression to True or False.

        Comparisons are chained pairwise like in Python, i.e. a < b < c is
        true if both a < b and b < c are. Boolean 'and'/'or' chains are
        evaluated on truthiness of their operands. """
        self.set_context(context)

        # each precedence level of the grammar has its own operators, so
        # looking at the first operator is enough to tell boolean chains
        # from comparisons
        first_op = self.value[1]
        if first_op in ('and', 'or'):
            return self._eval_boolean_chain(first_op)

        val1 = self.subeval(self.value[0])
        for op, parseobj in operator_operands(self.value[1:]):
            val2 = self.subeval(parseobj)
            logger.debug("Comparison: %s %s %s", val1, op, val2)
            function = self.operator_map[op]
            if not self.handle_regexes(function, val1, val2):
                break
            val1 = val2
        else:
            # no comparison of the chain failed
            return True

        return False


class ExpressionParser(object):
    """ Define and run the parser. """
    def __init__(self):
        # speed up infixNotation considerably at the price of some cache memory
        ParserElement.enablePackrat()

        boolean = Keyword('True') | Keyword('False')
        integer = Word(nums)
        real = Combine(Word(nums) + "." + Word(nums))
        string = (QuotedString('"', escChar='\\')
                  | QuotedString("'", escChar='\\'))
        regex = QuotedString('/', escChar='\\')
        identifier = Word(alphas, alphanums + '_')
        dereference = infixNotation(identifier, [
            (Literal('.'), 2, opAssoc.LEFT, EvalArith),
        ])
        result = (Keyword('bad') | Keyword('fail') | Keyword('good')
                  | Keyword('ignore') | Keyword('unknown'))
        rval = boolean | real | integer | string | regex | result | dereference
        rvallist = Group(Suppress('[') + delimitedList(rval) + Suppress(']'))
        rvalset = Group(Suppress('{') + delimitedList(rval) + Suppress('}'))
        operand = rval | rvallist | rvalset

        # parse actions replace the parsed tokens with an instantiated object
        # which we can later call into for evaluation of its content
        boolean.setParseAction(EvalBoolean)
        integer.setParseAction(EvalInteger)
        real.setParseAction(EvalReal)
        string.setParseAction(EvalString)
        regex.setParseAction(EvalRegex)
        identifier.setParseAction(EvalIdentifier)
        result.setParseAction(EvalResult)
        rvallist.setParseAction(EvalList)
        rvalset.setParseAction(EvalSet)

        identity_test = Keyword('is') + ~Keyword('not') | Combine(
            Keyword('is') + Keyword('not'), adjacent=False, joinString=' ')
        membership_test = Keyword('in') | Combine(
            Keyword('not') + Keyword('in'), adjacent=False, joinString=' ')
        comparison_op = oneOf('< <= > >= != == isdisjoint')
        comparison = identity_test | membership_test | comparison_op

        self.parser = infixNotation(operand, [
            (Literal('**'), 2, opAssoc.LEFT, EvalPower),
            (oneOf('+ - ~'), 1, opAssoc.RIGHT, EvalModifier),
            (oneOf('* / // %'), 2, opAssoc.LEFT, EvalArith),
            (oneOf('+ -'), 2, opAssoc.LEFT, EvalArith),
            (oneOf('<< >>'), 2, opAssoc.LEFT, EvalArith),
            (Literal('&'), 2, opAssoc.LEFT, EvalArith),
            (Literal('^'), 2, opAssoc.LEFT, EvalArith),
            (Literal('|'), 2, opAssoc.LEFT, EvalArith),
            (comparison, 2, opAssoc.LEFT, EvalLogic),
            (Keyword('not'), 1, opAssoc.RIGHT, EvalModifier),
            (Keyword('and'), 2, opAssoc.LEFT, EvalLogic),
            (Keyword('or'), 2, opAssoc.LEFT, EvalLogic),
            (Keyword('->'), 2, opAssoc.LEFT, EvalArith),
        ])

    def parse(self, expression):
        """ Parse an expression and return an object supporting evaluation of
        that expression against a context.

        Raises SyntaxError with the offending position marked if the
        expression cannot be parsed. """
        try:
            return self.parser.parseString(expression, parseAll=True)[0]
        except ParseException as parse_error:
            # loc is the 0-based index into the expression while col is a
            # 1-based column. Slice instead of indexing so that an error at
            # the very end of the expression cannot raise an IndexError.
            loc = parse_error.loc
            raise SyntaxError(
                "Expression parse error near character %d: %s>>%s<<%s" % (
                    parse_error.col, expression[:loc],
                    expression[loc:loc + 1], expression[loc + 1:]))


if __name__ == '__main__':
    print(ExpressionParser().parse('foo == (bar - blub)'))