From 3d1d47deb949b37a2c89ea47db1798fbef588083 Mon Sep 17 00:00:00 2001 From: James Macdonell Date: Thu, 30 May 2024 11:05:14 -0700 Subject: [PATCH 1/2] Recursive _find_es_dict_by_key #1450 Significant rewrite. Attempting to better handle cases where keyword is actually part of the fieldname. --- elastalert/util.py | 157 ++++++++++++++++++--------------------------- tests/util_test.py | 18 ++++++ 2 files changed, 79 insertions(+), 96 deletions(-) diff --git a/elastalert/util.py b/elastalert/util.py index 04c0f274..8589e3a5 100644 --- a/elastalert/util.py +++ b/elastalert/util.py @@ -44,105 +44,70 @@ def new_get_event_ts(ts_field): return lambda event: lookup_es_key(event[0], ts_field) -def _find_es_dict_by_key(lookup_dict: dict, term: str, string_multi_field_name: str = ".keyword") -> tuple[dict, str]: - """ Performs iterative dictionary search based upon the following conditions: - - 1. Subkeys may either appear behind a full stop (.) or at one lookup_dict level lower in the tree. +def _find_es_dict_by_key(lookup_dict: dict, term: str, string_multi_field_name: str = "keyword") -> tuple[dict, str]: + """ Performs a divide-and-conquer recursive search to resolve a term string + as a compatible dictionary key and list index combination. It attempts + to resolve the ambiguity for . and "keyword" being either literals or delimiters. + + For example + 'my.dotted.name.a_child_field.somelist[4]' + may be found as + lookup_dict['my.dotted.name']['a_child_field']['somelist'][4] + or found as + lookup_dict['my']['dotted.name']['a_child_field.somelist'][4] + + 1. Prefers longer fieldname matches 2. No wildcards exist within the provided ES search terms (these are treated as string literals) + 3. Firstly assumes 'keyword' is a fieldname, then assumes 'keyword' is a subfield specifier for a multifield - This is necessary to get around inconsistencies in ES data. 
- - For example: - {'ad.account_name': 'bob'} - Or: - {'csp_report': {'blocked_uri': 'bob.com'}} - And even: - {'juniper_duo.geoip': {'country_name': 'Democratic People's Republic of Korea'}} - - We want a search term of form "key.subkey.subsubkey" to match in all cases. - :returns: A tuple with the first element being the dict that contains the key and the second - element which is the last subkey used to access the target specified by the term. None is - returned for both if the key can not be found. """ - - # For compound fieldnames added by ElastAlert.process_hits() - # - # For example, when query_key is a list of fieldnames it will insert a term - # 'key_1,other_fieldname,a_third_name' - # and if the rule is set for raw_query_keys, the query_key values may end - # with .keyword it will insert instead something like - # 'key_1_ip,other_fieldname_number,a_third_name.keyword' - # and we need to check for that synthentic compound fielname, including the - # .keyword suffix before contnuing - # - # Of course, it also handles happy path, non-ambuiguous fieldnames like - # 'ip_address' and 'src_displayname' that don't have . or [] characters - if term in lookup_dict: - return lookup_dict, term - - # If not synthetically added by ElastAlert, matching documents will not have - # .keyword fieldnames, even if a .keyword fieldname was used as a term in - # the search - # e.g. {"term": {"description.keyword": "Target Description Here"}} - # will return a document with {"_source": {"description": "Target Description Here"}} - term = term.removesuffix(string_multi_field_name) - if term in lookup_dict: - return lookup_dict, term - - # If the term does not match immediately, perform iterative lookup: - # 1. Split the search term into tokens - # 2. Recurrently concatenate these together to traverse deeper into the dictionary, - # clearing the subkey at every successful lookup. 
- # - # This greedy approach is correct because subkeys must always appear in order, - # preferring full stops and traversal interchangeably. - # - # Subkeys will NEVER be duplicated between an alias and a traversal. - # - # For example: - # {'foo.bar': {'bar': 'ray'}} to look up foo.bar will return {'bar': 'ray'}, not 'ray' - dict_cursor = lookup_dict - - while term: - split_results = re.split(r'\[(\d)\]', term, maxsplit=1) - if len(split_results) == 3: - sub_term, index, term = split_results - index = int(index) - else: - sub_term, index, term = split_results + [None, ''] - - subkeys = sub_term.split('.') - - subkey = '' - - while len(subkeys) > 0: - if not dict_cursor: - return {}, None - - subkey += subkeys.pop(0) - - if subkey in dict_cursor: - if len(subkeys) == 0: - break - dict_cursor = dict_cursor[subkey] - subkey = '' - elif len(subkeys) == 0: - # If there are no keys left to match, return None values - dict_cursor = None - subkey = None - else: - subkey += '.' - - if index is not None and subkey: - dict_cursor = dict_cursor[subkey] - if type(dict_cursor) == list and len(dict_cursor) > index: - subkey = index - if term: - dict_cursor = dict_cursor[subkey] - else: - return {}, None - - return dict_cursor, subkey + subkeys = term.split('.') + + # reverse to match longest fieldnames first + for i in reversed(range(1, len(subkeys)+1)): + root = ".".join(subkeys[0:i]) + + # Handle array index references + # Example + # foo[3]bar[1]baz is recursively checked as + # _find_es_dict_by_key(lookup_dict['foo'][3], 'bar[1]baz') + + m = re.search(r'(.+?)\[(\d)\](.*)', root) + value_index = None + child_components = [] + if m: + root = m.group(1) + value_index = int(m.group(2)) + if m.group(3): + child_components.append(m.group(3)) + + if root in lookup_dict: + child_components.extend(subkeys[i:]) + + # Pursue 'keyword' (if present) as a literal required fieldname + child_components_options = [child_components] + try: + # Then pursue 'keyword' (if present) as subfield 
specifier by ignoring it + if child_components[-1] == string_multi_field_name: + child_components_options.append(child_components[:-1]) + except IndexError: + pass + + for child_components_option in child_components_options: + child = ".".join(child_components_option) + if value_index is not None: + if not child: + return lookup_dict[root], value_index + if isinstance(lookup_dict[root][value_index], dict): + try: + return _find_es_dict_by_key(lookup_dict[root][value_index], child, string_multi_field_name) + except IndexError: + return {}, None + + if child and isinstance(lookup_dict[root], dict): + return _find_es_dict_by_key(lookup_dict[root], child, string_multi_field_name) + return lookup_dict, root + return {}, None def set_es_key(lookup_dict, term, value): diff --git a/tests/util_test.py b/tests/util_test.py index 32dcbc26..4fd53f64 100644 --- a/tests/util_test.py +++ b/tests/util_test.py @@ -135,6 +135,24 @@ def test_looking_up_nested_composite_keys(ea): assert lookup_es_key(record, 'Fields.ts.value.keyword') == expected +def test_looking_up_nested_composite_keys_with_fieldname_literary_containing_keyword(ea): + expected = 12467267 + record = { + 'Message': '12345', + 'Fields': { + 'ts': { + 'value': { + 'keyword': expected, + } + }, + 'severity': 'large', + 'user': 'jimmay' + } + } + + assert lookup_es_key(record, 'Fields.ts.value.keyword') == expected + + def test_looking_up_arrays(ea): record = { 'flags': [1, 2, 3], From b057e934f27a71bc894841bf26617151f6ead8ef Mon Sep 17 00:00:00 2001 From: James Macdonell Date: Mon, 3 Jun 2024 17:22:38 -0700 Subject: [PATCH 2/2] CHANGELOG for #1459 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 983db0f7..6d71e1a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ ## Other changes - [Docs] Fixed typo in Alerta docs with incorrect number of seconds in a day. - @jertel - Update GitHub actions to avoid running publish workflows on forked branches. 
- @jertel +- Rewrite `_find_es_dict_by_key` per [discussion #1450](https://github.com/jertel/elastalert2/discussions/1450) for fieldnames literally ending in `.keyword` [#1459](https://github.com/jertel/elastalert2/pull/1459) - @jmacdone @jertel # 2.18.0