# Source code for eql.parser

"""Python parser functions for EQL syntax."""
from __future__ import unicode_literals

import datetime
from collections import OrderedDict

import tatsu
import tatsu.exceptions
import tatsu.objectmodel
import tatsu.semantics
import tatsu.walkers

from eql.ast import *  # noqa: F401
from eql.errors import ParseError, SchemaError
from eql.etc import get_etc_file
from eql.schema import EVENT_TYPE_ANY, check_event_name
from eql.utils import is_string, to_unicode


# Names exported by ``from eql.parser import *``.
__all__ = (
    "get_preprocessor",
    "parse_definition",
    "parse_definitions",
    "parse_expression",
    "parse_query",
    "parse_analytic",
    "parse_analytics",
)


# Used for time units: every duration is expressed as a number of seconds
SECOND = 1
MINUTE = 60 * SECOND
HOUR = MINUTE * 60
DAY = HOUR * 24

# Time units (shorthand): unit name -> length in seconds.
# ``EqlWalker.walk__time_range`` matches a unit written in a query against these names by prefix.
units = {
    'second': SECOND,
    'minute': MINUTE,
    'hour': HOUR,
    'hr': HOUR,
    'day': DAY
}


# Grammar text and compiled Tatsu parser; both are filled in by ``_parse`` the first time it runs
GRAMMAR = None
tatsu_parser = None


class EqlWalker(tatsu.walkers.NodeWalker):
    """Walker of Tatsu semantic model to convert it into a EQL AST."""

    def __init__(self, implied_base=False, implied_any=False, subqueries=True, pipes=True, preprocessor=None):
        """Walker for building an EQL syntax tree from a Tatsu syntax tree.

        :param bool implied_any: Allow for event queries to skip event type and WHERE, replace with 'any where ...'
        :param bool implied_base: Allow for queries to be built with only pipes. Base query becomes 'any where true'
        :param bool subqueries: Toggle support for subqueries (sequence, join, named of, etc.)
        :param bool pipes: Toggle support for pipes
        :param PreProcessor preprocessor: Use an EQL preprocessor to expand definitions and constants while parsing
        """
        super(EqlWalker, self).__init__()
        self.implied_base = implied_base
        self.implied_any = implied_any
        self.preprocessor = preprocessor or PreProcessor()
        self._eql_walker = AstWalker()
        self._subqueries_enabled = subqueries
        self._pipes_enabled = pipes

    @staticmethod
    def _error(node, message, end=False, cls=ParseError):
        """Build (without raising) an error that points at the source text of a Tatsu node.

        :param node: Tatsu node providing ``ast`` and ``parseinfo``. Plain strings are not accepted,
            because they carry neither attribute.
        :param str message: Error text. Named placeholders such as ``{name}`` are filled in from the
            entries of ``node.ast``; list entries are joined with ``', '``.
        :param bool end: Report the end position of the node instead of its start position
        :param cls: Exception class to instantiate
        :return: Instance of ``cls`` for the caller to raise
        """
        params = dict(node.ast)
        for k, value in params.items():
            if isinstance(value, list):
                params[k] = ', '.join([v.render() if isinstance(v, EqlNode) else to_unicode(v) for v in value])
        message = message.format(**params)
        lines = node.parseinfo.text_lines()
        line_number = node.parseinfo.line
        # Clamp to the last available line so the lookup below cannot run past the end
        if line_number >= len(lines):
            line_number = len(lines) - 1
        bad_line = lines[line_number].rstrip()
        pos = node.parseinfo.endpos if end else node.parseinfo.pos
        return cls(message, line_number, pos, bad_line)

    def _walk_default(self, node, *args, **kwargs):
        """Fallback for nodes without a dedicated callback: walk list items, return anything else as-is."""
        if isinstance(node, list):
            return [self.walk(n, *args, **kwargs) for n in node]
        return node

    def walk(self, node, *args, **kwargs):
        """Optimize the AST while walking it."""
        output = super(EqlWalker, self).walk(node, *args, **kwargs)
        if isinstance(output, EqlNode):
            return output.optimize()
        return output

    # literals
    def walk__literal(self, node):
        """Convert a literal node into a ``String``, ``Boolean`` or ``Number`` (or ``None`` when it has no value)."""
        literal = self.walk(node.value)
        if literal is None:
            return literal
        elif is_string(literal):
            # If a 'raw' string is detected, then only unescape the quote character
            if node.text.startswith('?'):
                quote_char = node.text[-1]
                literal = literal.replace("\\" + quote_char, quote_char)
            else:
                literal = String.unescape(literal)
            return String(to_unicode(literal))
        elif isinstance(literal, bool):
            return Boolean(literal)
        else:
            return Number(literal)

    def walk__time_range(self, node):
        """Convert a value and a time unit into a ``TimeRange``.

        The unit is matched by prefix against the keys of ``units`` after trailing ``s`` characters
        are removed. A unit made up only of ``s`` is matched as ``s``.

        :raises ParseError: If the unit does not match any known unit
        """
        val = self.walk(node.val)
        unit = self.walk(node.unit)
        prefix = unit.rstrip('s') or 's'
        for name, interval in units.items():
            if name.startswith(prefix):
                return TimeRange(datetime.timedelta(seconds=val * interval))

        # The unit is a plain string with no parse info of its own, so report against the whole node
        raise self._error(node, "Unknown time unit")

    # fields
    def walk__field(self, node):
        """Convert a field node into a ``Field``, a reserved literal, or the value of a known constant.

        :raises ParseError: If a reserved word (``true``, ``false``, ``null``) is used with sub fields
            or as an attribute name
        """
        reserved = 'true', 'false', 'null'
        if node.base in reserved:
            if len(node.sub_fields) != 0:
                raise self._error(node, "Invalid field name {base}")
            elif node.base == 'true':
                return Boolean(True)
            elif node.base == 'false':
                return Boolean(False)
            elif node.base == 'null':
                return Null()
            else:
                # Defensive only: every reserved word is handled above.
                # The error is built from ``node`` because ``node.base`` is a bare string.
                raise self._error(node, "Unhandled literal")

        path = []

        for sub_field in self.walk(node.sub_fields):
            if is_string(sub_field) and sub_field in reserved:
                raise self._error(node, "Invalid attribute {}".format(sub_field))
            path.append(sub_field)

        # A bare name that matches a preprocessor constant is replaced with the constant's value
        if not path and node.base in self.preprocessor.constants:
            constant = self.preprocessor.constants[node.base]
            return constant.value

        return Field(node.base, path)

    # comparisons
    def walk__equals(self, node):
        """Normalize the equality operator to ``==``."""
        # May be double or single equals
        return '=='

    def walk__comparison(self, node):
        """Convert a comparison node into a ``Comparison``, or a ``wildcard`` call for ``*`` patterns."""
        left = self.walk(node.left)
        right = self.walk(node.right)
        op = self.walk(node.op)

        # there is no special comparator for wildcards, just look for * in the string
        if isinstance(right, String) and '*' in right.value:
            if op == Comparison.EQ:
                return FunctionCall('wildcard', [left, right])
            elif op == Comparison.NE:
                return ~ FunctionCall('wildcard', [left, right])

        return Comparison(left, op, right)

    def walk__and_terms(self, node):
        """Combine the walked terms into an ``And`` node."""
        terms = self.walk(node.terms)
        term = And(terms)
        return term

    def walk__or_terms(self, node):
        """Combine the walked terms into an ``Or`` node."""
        terms = self.walk(node.terms)
        term = Or(terms)
        return term

    def walk__not_term(self, node):
        """Wrap the walked term in a ``Not`` node."""
        term = Not(self.walk(node.t))
        return term

    def walk__in_set(self, node):
        """Convert a set membership test into an ``InSet`` node."""
        expr = self.walk(node.expr)
        container = self.walk(node.container)  # type: list[Expression]
        return InSet(expr, container)

    def walk__function_call(self, node):
        """Convert a call into a ``FunctionCall``, expanding it first when the name is a known macro."""
        args = self.walk(node.args) or []

        if node.name in self.preprocessor.macros:
            macro = self.preprocessor.macros[node.name]
            return macro.expand(args, self._eql_walker)

        return FunctionCall(node.name, args)

    # queries
    def walk__event_query(self, node):
        """Convert an event query node into an ``EventQuery``.

        :raises ParseError: If the event type is missing and ``implied_any`` is disabled
        :raises SchemaError: If ``check_event_name`` rejects the event type
        """
        if node.ast.get('event_type') is None:
            event_type = EVENT_TYPE_ANY
            if not self.implied_any:
                raise self._error(node, "Missing event type and 'where' condition")
        else:
            event_type = node.event_type
            if not check_event_name(event_type):
                raise self._error(node, "Invalid event type: {event_type}", cls=SchemaError)

        return EventQuery(event_type, self.walk(node.cond))

    def walk__pipe(self, node):
        """Convert a pipe node into the matching ``PipeCommand`` and validate its arguments.

        :raises ParseError: If pipes are disabled, the pipe name is unknown, the number of arguments
            is out of range, or the pipe's own validation reports a bad argument
        """
        if not self._pipes_enabled:
            raise self._error(node, "Pipes not supported")

        pipe_cls = PipeCommand.lookup.get(node.name)
        if pipe_cls is None:
            raise self._error(node, "Unknown pipe '{name}'")

        pipe = pipe_cls(self.walk(node.args))  # type: PipeCommand
        num_args = len(pipe.arguments)

        error_node = node

        # Try to line up the error message with the argument that went wrong
        # Strings and numbers don't generate tatsu nodes, so it's difficult to recover parseinfo

        if pipe.minimum_args is not None and num_args < pipe.minimum_args:
            message = "Expected {} argument(s) to pipe '{}'".format(pipe.minimum_args, node.name)
            raise self._error(error_node, message, end=True)

        elif pipe.maximum_args is not None and num_args > pipe.maximum_args:
            message = "Expected up to {} argument(s) to pipe '{}'".format(pipe.maximum_args, node.name)
            if isinstance(node.args[pipe.maximum_args], tatsu.semantics.Node):
                error_node = node.args[pipe.maximum_args]
            raise self._error(error_node, message)

        bad_index = pipe.validate()
        if bad_index is not None:
            if isinstance(node.args[bad_index], tatsu.semantics.Node):
                error_node = node.args[bad_index]
            raise self._error(error_node, "Invalid arguments to '{}'".format(node.name))
        return pipe

    def walk__piped_query(self, node):
        """Convert a base query and its pipes into a ``PipedQuery``.

        :raises ParseError: If the base query is missing and ``implied_base`` is disabled
        """
        if node.query is None:
            first = EventQuery(EVENT_TYPE_ANY, Boolean(True))
            if not self.implied_base:
                raise self._error(node, "Missing base query")
        else:
            first = self.walk(node.query)
        return PipedQuery(first, self.walk(node.pipes))

    def walk__named_query(self, node):
        """Convert a named subquery into a ``NamedSubquery``.

        :raises ParseError: If subqueries are disabled or the name is not in ``NamedSubquery.supported_types``
        """
        if not self._subqueries_enabled:
            raise self._error(node, "Subqueries not supported")

        if node.name not in NamedSubquery.supported_types:
            options = ', '.join(NamedSubquery.supported_types)
            raise self._error(node, "Unknown subquery '{name}' of. Available options are: " + options)
        return NamedSubquery(node.name, self.walk(node.query))

    def walk__named_params(self, node, get_param=None):
        """Convert ``key=value`` parameters into ``NamedParams``.

        :param get_param: Callable that validates one parameter node and returns ``(key, value)``.
            When omitted, any parameter is rejected.
        :raises ParseError: If parameters are given without ``get_param``, or a key is repeated
        """
        params = OrderedDict()
        if get_param is None and len(node.params) > 0:
            raise self._error(node, "Unexpected parameters")

        for param in node.params:
            key, value = get_param(param)
            if key in params:
                raise self._error(param, "Repeated parameter '{k}'")
            params[key] = value
        return NamedParams(params)

    def walk__subquery_by(self, node, num_values=None, get_param=None):
        """Convert a subquery with its ``by`` values and parameters into a ``SubqueryBy``.

        :param int num_values: Required number of ``by`` values, or ``None`` to accept any number
        :param get_param: Parameter validator forwarded to :meth:`walk__named_params`
        :raises ParseError: If subqueries are disabled or the number of ``by`` values is wrong
        """
        if not self._subqueries_enabled:
            raise self._error(node, "Subqueries not supported")

        num_found = len(node.join_values)
        if num_values is not None and num_values != num_found:
            if num_found == 0:
                error_node = node.query
                end = True
            else:
                # Always report against the last value that was actually written. Indexing by the
                # expected count would run past the end of the list when too few values are given.
                error_node = node.join_values[num_found - 1]
                # Too few values: point just past the last one, where the next value was expected
                end = num_found < num_values
            raise self._error(error_node, "Expected {} value(s)".format(num_values), end=end)

        join_values = self.walk(node.join_values)
        params = self.walk(node.params, get_param=get_param)

        query = self.walk(node.query)
        return SubqueryBy(query, params, join_values)

    def walk__join(self, node):
        """Convert a join node into a ``Join``.

        Every subquery must use as many ``by`` values as the first one. Shared ``by`` values are
        prepended to the values of each subquery and of the ``until`` subquery.

        :raises ParseError: If subqueries are disabled
        """
        if not self._subqueries_enabled:
            raise self._error(node, "Subqueries not supported")

        shared = []
        close = None

        if node.ast.get('shared_by'):
            shared = self.walk(node.shared_by)

        # Figure out how many fields are joined by in the first query, and match across all
        first = self.walk(node.queries[0])
        num_values = len(first.join_values)
        queries = [first]  # type: list[SubqueryBy]
        queries.extend(self.walk(node.queries[1:], num_values=num_values))

        for query in queries:
            query.join_values = shared + query.join_values

        if node.ast.get('until'):
            close = self.walk(node.until, num_values=num_values)  # type: SubqueryBy
            close.join_values = shared + close.join_values

        return Join(queries, close)

    def get_sequence_parameter(self, node):
        """Validate a parameter of a whole sequence; only ``maxspan`` is accepted.

        :return: Tuple of the key and the value converted with ``TimeRange.convert``
        :raises ParseError: If the key is unknown, or the value is not a non-negative time range
        """
        key, value = self.walk([node.k, node.v])
        value = TimeRange.convert(value)

        if key != 'maxspan':
            raise self._error(node, "Unknown sequence parameter '{}'".format(key))

        if not TimeRange.convert(value) or value.delta < datetime.timedelta(0):
            # Prefer the value's own position when it is a node that carries parse info
            error_node = node.v if isinstance(node.v, tatsu.objectmodel.Node) else node
            raise self._error(error_node, "Invalid value for '{}'".format(key))

        return key, value

    def get_sequence_term_parameter(self, param_node):
        """Validate a parameter of one subquery in a sequence; only ``fork`` is accepted.

        A ``fork`` without a value is treated as true.

        :return: Tuple of the key and a ``Boolean`` value
        :raises ParseError: If the key is unknown, or the value is not a boolean, ``0`` or ``1``
        """
        key, value = self.walk([param_node.k, param_node.ast.get('v', Boolean(True))])
        if value is None:
            value = Boolean(True)

        if key != 'fork':
            raise self._error(param_node, "Unknown parameter '{}'".format(key))

        elif not isinstance(value, (Boolean, Number)) or value.value not in (True, False, 0, 1):
            raise self._error(param_node, "Invalid value for '{}'".format(key))

        return key, Boolean(bool(value.value))

    def walk__sequence(self, node):
        """Convert a sequence node into a ``Sequence``.

        Every subquery must use as many ``by`` values as the first one. Shared ``by`` values are
        prepended to the values of each subquery and of the ``until`` subquery.

        :raises ParseError: If subqueries are disabled
        """
        if not self._subqueries_enabled:
            raise self._error(node, "Subqueries not supported")

        shared = []
        close = None
        params = None

        if node.ast.get('shared_by'):
            shared = self.walk(node.shared_by)

        if node.ast.get('params'):
            params = self.walk(node.params, get_param=self.get_sequence_parameter)

        # Figure out how many fields are joined by in the first query, and match across all
        first = self.walk(node.queries[0])
        num_values = len(first.join_values)
        queries = [first]  # type: list[SubqueryBy]
        queries.extend(self.walk(node.queries[1:], num_values=num_values, get_param=self.get_sequence_term_parameter))

        for query in queries:
            query.join_values = shared + query.join_values

        if node.ast.get('until'):
            close = self.walk(node.until, num_values=num_values)
            close.join_values = shared + close.join_values

        return Sequence(queries, params, close)

    # definitions
    def walk__macro(self, node):
        """Convert a macro definition into a ``Macro``."""
        return Macro(node.name, node.params, self.walk(node.body))

    def walk__constant(self, node):
        """Convert a constant definition into a ``Constant``."""
        return Constant(node.name, self.walk(node.value))


def _parse(text, start=None, preprocessor=None, implied_any=False, implied_base=False, pipes=True, subqueries=True):
    """Parse EQL source from an arbitrary grammar rule and convert the result with :class:`EqlWalker`.

    The grammar is loaded and compiled on the first call and stored in the module-level
    ``GRAMMAR`` and ``tatsu_parser`` for later calls.

    :param str text: EQL source text to parse
    :param str start: Entry point for the EQL grammar
    :param PreProcessor preprocessor: Optional preprocessor to expand definitions and constants
    :param bool implied_any: Allow for event queries to match on any event type when a type is not specified.
         If enabled, the query ``process_name == "cmd.exe"`` becomes ``any where process_name == "cmd.exe"``
    :param bool implied_base: Allow for queries to be built with only pipes. Base query becomes 'any where true'
    :param bool pipes: Toggle support for pipes
    :param bool subqueries: Toggle support for subqueries, which are required by
        ``sequence``, ``join``, ``descendant of``, ``child of`` and ``event of``
    :raises ParseError: If the text is blank or does not match the grammar
    :rtype: EqlNode
    """
    global GRAMMAR, tatsu_parser

    # Compile the grammar once; every later call reuses the cached parser
    if tatsu_parser is None:
        GRAMMAR = get_etc_file('eql.ebnf')
        model_semantics = tatsu.semantics.ModelBuilderSemantics()
        tatsu_parser = tatsu.compile(GRAMMAR, parseinfo=True, semantics=model_semantics)

    if not text.strip():
        raise ParseError("No text specified", 0, 0, text)

    # Convert everything to unicode
    text = to_unicode(text)
    walker = EqlWalker(
        implied_any=implied_any,
        implied_base=implied_base,
        preprocessor=preprocessor,
        pipes=pipes,
        subqueries=subqueries,
    )

    try:
        return walker.walk(tatsu_parser.parse(text, rule_name=start, start=start, parseinfo=True))
    except tatsu.exceptions.FailedParse as failure:
        location = failure.buf.line_info(failure.pos)
        reason = failure.message
        line_number = location.line
        column = location.col
        bad_line = location.text.rstrip()
        if not bad_line:
            # The failing line is empty: report the end of the last non-blank line of the input
            bad_line = text.strip().splitlines()[-1].strip()
            column = max(len(bad_line) - 1, 0)
        raise ParseError(reason, line_number, column, bad_line)


def parse_base_query(text, implied_any=False, implied_base=False, preprocessor=None, subqueries=True):
    """Parse EQL source with the ``base_query`` grammar rule (a query without pipes).

    :param str text: EQL source text to parse
    :param bool implied_any: Allow for event queries to match on any event type when a type is not specified.
         If enabled, the query ``process_name == "cmd.exe"`` becomes ``any where process_name == "cmd.exe"``
    :param bool implied_base: Allow for queries to be built with only pipes. Base query becomes 'any where true'
    :param PreProcessor preprocessor: Optional preprocessor to expand definitions and constants
    :param bool subqueries: Toggle support for subqueries, which are required by
        ``sequence``, ``join``, ``descendant of``, ``child of`` and ``event of``
    :rtype: EqlNode
    """
    options = dict(
        implied_any=implied_any,
        implied_base=implied_base,
        preprocessor=preprocessor,
        subqueries=subqueries,
    )
    return _parse(text, start='base_query', **options)


def parse_event_query(text, implied_any=False, implied_base=False, preprocessor=None, subqueries=True):
    """Parse EQL source with the ``event_query`` rule, in the form ``<event-type> where <condition>``.

    :param str text: EQL source text to parse
    :param bool implied_any: Allow for event queries to match on any event type when a type is not specified.
         If enabled, the query ``process_name == "cmd.exe"`` becomes ``any where process_name == "cmd.exe"``
    :param bool implied_base: Allow for queries to be built with only pipes. Base query becomes 'any where true'
    :param PreProcessor preprocessor: Optional preprocessor to expand definitions and constants
    :param bool subqueries: Toggle support for subqueries, which are required by
        ``sequence``, ``join``, ``descendant of``, ``child of`` and ``event of``
    :rtype: EventQuery
    """
    options = dict(
        implied_any=implied_any,
        implied_base=implied_base,
        preprocessor=preprocessor,
        subqueries=subqueries,
    )
    return _parse(text, start='event_query', **options)


def parse_query(text, implied_any=False, implied_base=False, preprocessor=None, subqueries=True, pipes=True):
    """Parse a full EQL query with pipes.

    :param str text: EQL source text to parse
    :param bool implied_any: Allow for event queries to match on any event type when a type is not specified.
         If enabled, the query ``process_name == "cmd.exe"`` becomes ``any where process_name == "cmd.exe"``
    :param bool implied_base: Allow for queries to be built with only pipes. Base query becomes 'any where true'
    :param PreProcessor preprocessor: Optional preprocessor to expand definitions and constants
    :param bool subqueries: Toggle support for subqueries, which are required by
        ``sequence``, ``join``, ``descendant of``, ``child of`` and ``event of``
    :param bool pipes: Toggle support for pipes
    :raises ParseError: If the text is blank or is not a valid query
    :rtype: PipedQuery
    """
    return _parse(text, 'single_query', implied_any=implied_any, implied_base=implied_base, preprocessor=preprocessor,
                  subqueries=subqueries, pipes=pipes)


def parse_expression(text, implied_any=False, preprocessor=None, subqueries=True):
    """Parse an EQL expression and return the AST.

    :param str text: EQL source text to parse
    :param bool implied_any: Allow for event queries to match on any event type when a type is not specified.
         If enabled, the query ``process_name == "cmd.exe"`` becomes ``any where process_name == "cmd.exe"``
    :param PreProcessor preprocessor: Optional preprocessor to expand definitions and constants
    :param bool subqueries: Toggle support for subqueries, which are required by
        ``sequence``, ``join``, ``descendant of``, ``child of`` and ``event of``
    :raises ParseError: If the text is blank or is not a valid expression
    :rtype: Expression
    """
    return _parse(text, start='single_expression',
                  implied_any=implied_any, preprocessor=preprocessor, subqueries=subqueries)


def parse_analytic(analytic_info, preprocessor=None, **kwargs):
    """Parse an EQL analytic from a dictionary with metadata.

    The input dictionary is not modified: a shallow copy is made and its ``query`` entry is
    replaced with the parsed query before the copy is passed to ``EqlAnalytic``.

    :param dict analytic_info: EQL dictionary with metadata and a query to convert to an analytic.
        It must contain the key ``query`` holding the query text.
    :param PreProcessor preprocessor: Optional preprocessor to expand definitions and constants
    :param kwargs: Additional arguments to pass to :func:`~parse_query`
    :raises KeyError: If ``analytic_info`` has no ``query`` entry
    :rtype: EqlAnalytic
    """
    dct = analytic_info.copy()
    dct['query'] = parse_query(dct['query'], preprocessor=preprocessor, **kwargs)
    return EqlAnalytic(**dct)


def parse_analytics(analytics, preprocessor=None, **kwargs):
    """Parse EQL analytics from a list of dictionaries.

    :param list[dict] analytics: Dictionaries with metadata and a query, each converted to an analytic
    :param PreProcessor preprocessor: Optional preprocessor to expand definitions and constants.
        When omitted, one new preprocessor is created and used for every analytic in the list.
    :param kwargs: Additional arguments to pass to :func:`~parse_query`
    :rtype: list[EqlAnalytic]
    """
    if preprocessor is None:
        preprocessor = PreProcessor()
    return [parse_analytic(r, preprocessor=preprocessor, **kwargs) for r in analytics]


def parse_definition(text, preprocessor=None, implied_any=False, subqueries=True):
    """Parse exactly one EQL definition using the ``single_definition`` grammar rule.

    :param str text: EQL source to parse
    :param PreProcessor preprocessor: Optional preprocessor to expand definitions and constants
    :param bool implied_any: Allow for event queries to skip event type and WHERE, replace with 'any where ...'
    :param bool subqueries: Toggle support for subqueries (sequence, join, named of, etc.)
    :rtype: Definition
    """
    options = dict(preprocessor=preprocessor, implied_any=implied_any, subqueries=subqueries)
    return _parse(text, start='single_definition', **options)


def parse_definitions(text, preprocessor=None, implied_any=False, subqueries=True):
    """Parse EQL preprocessor definitions from source.

    :param str text: EQL source to parse
    :param PreProcessor preprocessor: Use an EQL preprocessor to expand definitions and constants while parsing
    :param bool implied_any: Allow for event queries to match on any event type when a type is not specified.
         If enabled, the query ``process_name == "cmd.exe"`` becomes ``any where process_name == "cmd.exe"``
    :param bool subqueries: Toggle support for subqueries, which are required by
        ``sequence``, ``join``, ``descendant of``, ``child of`` and ``event of``
    :raises ParseError: If the text is blank or does not match the ``definitions`` grammar rule
    :rtype: list[Definition]
    """
    return _parse(text, start='definitions', preprocessor=preprocessor, implied_any=implied_any, subqueries=subqueries)


def get_preprocessor(text, implied_any=False, subqueries=None, preprocessor=None):
    """Parse EQL definitions and get a :class:`~eql.ast.PreProcessor`.

    The preprocessor passed in is left untouched: the definitions are added to a copy of it,
    or to a new preprocessor when none is given.

    :param str text: EQL source to parse
    :param bool implied_any: Allow for event queries to match on any event type when a type is not specified.
         If enabled, the query ``process_name == "cmd.exe"`` becomes ``any where process_name == "cmd.exe"``
    :param bool subqueries: Toggle support for subqueries, which are required by
        ``descendant of``, ``child of`` and ``event of``. ``None`` (the default) means enabled,
        matching the default of :func:`parse_definitions`.
    :param PreProcessor preprocessor: Use an existing EQL preprocessor while parsing definitions
    :rtype: PreProcessor
    """
    # ``None`` means "not specified". Passing it through unchanged would be read as a false value
    # and would silently disable subqueries, unlike every other parsing function in this module.
    if subqueries is None:
        subqueries = True

    definitions = parse_definitions(text, implied_any=implied_any, subqueries=subqueries, preprocessor=preprocessor)

    # inherit all the definitions from the old one, and add to them
    if preprocessor is None:
        new_preprocessor = PreProcessor()
    else:
        new_preprocessor = preprocessor.copy()

    new_preprocessor.add_definitions(definitions)
    return new_preprocessor