From 3d1d47deb949b37a2c89ea47db1798fbef588083 Mon Sep 17 00:00:00 2001
From: James Macdonell <jmacdone@csusb.edu>
Date: Thu, 30 May 2024 11:05:14 -0700
Subject: [PATCH 1/2] Recursive _find_es_dict_by_key #1450

Significant rewrite of _find_es_dict_by_key.  Attempts to better handle cases
where "keyword" is literally part of the fieldname rather than a multi-field suffix.
---
 elastalert/util.py | 157 ++++++++++++++++++---------------------------
 tests/util_test.py |  18 ++++++
 2 files changed, 79 insertions(+), 96 deletions(-)

diff --git a/elastalert/util.py b/elastalert/util.py
index 04c0f274..8589e3a5 100644
--- a/elastalert/util.py
+++ b/elastalert/util.py
@@ -44,105 +44,70 @@ def new_get_event_ts(ts_field):
     return lambda event: lookup_es_key(event[0], ts_field)
 
 
-def _find_es_dict_by_key(lookup_dict: dict, term: str, string_multi_field_name: str = ".keyword") -> tuple[dict, str]:
-    """ Performs iterative dictionary search based upon the following conditions:
-
-    1. Subkeys may either appear behind a full stop (.) or at one lookup_dict level lower in the tree.
+def _find_es_dict_by_key(lookup_dict: dict, term: str, string_multi_field_name: str = "keyword") -> tuple[dict, str]:
+    """ Performs a divide-and-conquer recursive search to resolve a term string
+    as a compatible dictionary key and list index combination.  It attempts
+    to resolve the ambiguity of '.' and "keyword" being either literals or delimiters.
+
+    For example
+       'my.dotted.name.a_child_field.somelist[4]'
+    may be found as
+        lookup_dict['my.dotted.name']['a_child_field']['somelist'][4]
+    or found as
+        lookup_dict['my']['dotted.name']['a_child_field.somelist'][4]
+
+    1. Prefers longer fieldname matches
     2. No wildcards exist within the provided ES search terms (these are treated as string literals)
+    3. First treats 'keyword' as a literal fieldname, then falls back to treating it as a subfield specifier for a multifield
 
-    This is necessary to get around inconsistencies in ES data.
-
-    For example:
-      {'ad.account_name': 'bob'}
-    Or:
-      {'csp_report': {'blocked_uri': 'bob.com'}}
-    And even:
-       {'juniper_duo.geoip': {'country_name': 'Democratic People's Republic of Korea'}}
-
-    We want a search term of form "key.subkey.subsubkey" to match in all cases.
-    :returns: A tuple with the first element being the dict that contains the key and the second
-    element which is the last subkey used to access the target specified by the term. None is
-    returned for both if the key can not be found.
     """
-
-    # For compound fieldnames added by ElastAlert.process_hits()
-    #
-    # For example, when query_key is a list of fieldnames it will insert a term
-    #     'key_1,other_fieldname,a_third_name'
-    # and if the rule is set for raw_query_keys, the query_key values may end
-    # with .keyword it will insert instead something like
-    #     'key_1_ip,other_fieldname_number,a_third_name.keyword'
-    # and we need to check for that synthentic compound fielname, including the
-    # .keyword suffix before contnuing
-    #
-    # Of course, it also handles happy path, non-ambuiguous fieldnames like
-    # 'ip_address' and 'src_displayname' that don't have . or [] characters
-    if term in lookup_dict:
-        return lookup_dict, term
-
-    # If not synthetically added by ElastAlert, matching documents will not have
-    # .keyword fieldnames, even if a .keyword fieldname was used as a term in
-    # the search
-    # e.g. {"term": {"description.keyword": "Target Description Here"}}
-    # will return a document with {"_source": {"description": "Target Description Here"}}
-    term = term.removesuffix(string_multi_field_name)
-    if term in lookup_dict:
-        return lookup_dict, term
-
-    # If the term does not match immediately, perform iterative lookup:
-    # 1. Split the search term into tokens
-    # 2. Recurrently concatenate these together to traverse deeper into the dictionary,
-    #    clearing the subkey at every successful lookup.
-    #
-    # This greedy approach is correct because subkeys must always appear in order,
-    # preferring full stops and traversal interchangeably.
-    #
-    # Subkeys will NEVER be duplicated between an alias and a traversal.
-    #
-    # For example:
-    #  {'foo.bar': {'bar': 'ray'}} to look up foo.bar will return {'bar': 'ray'}, not 'ray'
-    dict_cursor = lookup_dict
-
-    while term:
-        split_results = re.split(r'\[(\d)\]', term, maxsplit=1)
-        if len(split_results) == 3:
-            sub_term, index, term = split_results
-            index = int(index)
-        else:
-            sub_term, index, term = split_results + [None, '']
-
-        subkeys = sub_term.split('.')
-
-        subkey = ''
-
-        while len(subkeys) > 0:
-            if not dict_cursor:
-                return {}, None
-
-            subkey += subkeys.pop(0)
-
-            if subkey in dict_cursor:
-                if len(subkeys) == 0:
-                    break
-                dict_cursor = dict_cursor[subkey]
-                subkey = ''
-            elif len(subkeys) == 0:
-                # If there are no keys left to match, return None values
-                dict_cursor = None
-                subkey = None
-            else:
-                subkey += '.'
-
-        if index is not None and subkey:
-            dict_cursor = dict_cursor[subkey]
-            if type(dict_cursor) == list and len(dict_cursor) > index:
-                subkey = index
-                if term:
-                    dict_cursor = dict_cursor[subkey]
-            else:
-                return {}, None
-
-    return dict_cursor, subkey
+    subkeys = term.split('.')
+
+    # reverse to match longest fieldnames first
+    for i in reversed(range(1, len(subkeys)+1)):
+        root = ".".join(subkeys[0:i])
+
+        # Handle array index references
+        # Example
+        # foo[3]bar[1]baz is recursively checked as
+        # _find_es_dict_by_key(lookup_dict['foo'][3], 'bar[1]baz')
+
+        m = re.search(r'(.+?)\[(\d)\](.*)', root)
+        value_index = None
+        child_components = []
+        if m:
+            root = m.group(1)
+            value_index = int(m.group(2))
+            if m.group(3):
+                child_components.append(m.group(3))
+
+        if root in lookup_dict:
+            child_components.extend(subkeys[i:])
+
+            # Pursue 'keyword' (if present) as a literal required fieldname
+            child_components_options = [child_components]
+            try:
+                # Then pursue 'keyword' (if present) as subfield specifier by ignoring it
+                if child_components[-1] == string_multi_field_name:
+                    child_components_options.append(child_components[:-1])
+            except IndexError:
+                pass
+
+            for child_components_option in child_components_options:
+                child = ".".join(child_components_option)
+                if value_index is not None:
+                    if not child:
+                        return lookup_dict[root], value_index
+                    if isinstance(lookup_dict[root][value_index], dict):
+                        try:
+                            return _find_es_dict_by_key(lookup_dict[root][value_index], child, string_multi_field_name)
+                        except IndexError:
+                            return {}, None
+
+                if child and isinstance(lookup_dict[root], dict):
+                    return _find_es_dict_by_key(lookup_dict[root], child, string_multi_field_name)
+                return lookup_dict, root
+    return {}, None
 
 
 def set_es_key(lookup_dict, term, value):
diff --git a/tests/util_test.py b/tests/util_test.py
index 32dcbc26..4fd53f64 100644
--- a/tests/util_test.py
+++ b/tests/util_test.py
@@ -135,6 +135,24 @@ def test_looking_up_nested_composite_keys(ea):
     assert lookup_es_key(record, 'Fields.ts.value.keyword') == expected
 
 
+def test_looking_up_nested_composite_keys_with_fieldname_literary_containing_keyword(ea):
+    expected = 12467267
+    record = {
+        'Message': '12345',
+        'Fields': {
+            'ts': {
+                'value': {
+                    'keyword': expected,
+                }
+            },
+            'severity': 'large',
+            'user': 'jimmay'
+        }
+    }
+
+    assert lookup_es_key(record, 'Fields.ts.value.keyword') == expected
+
+
 def test_looking_up_arrays(ea):
     record = {
         'flags': [1, 2, 3],

From b057e934f27a71bc894841bf26617151f6ead8ef Mon Sep 17 00:00:00 2001
From: James Macdonell <jmacdone@csusb.edu>
Date: Mon, 3 Jun 2024 17:22:38 -0700
Subject: [PATCH 2/2] CHANGELOG for #1459

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 983db0f7..6d71e1a1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
 ## Other changes
 - [Docs] Fixed typo in Alerta docs with incorrect number of seconds in a day. - @jertel
 - Update GitHub actions to avoid running publish workflows on forked branches. - @jertel
+- Rewrite `_find_es_dict_by_key` per [discussion #1450](https://github.com/jertel/elastalert2/discussions/1450) to correctly resolve fieldnames literally ending in `.keyword` - [#1459](https://github.com/jertel/elastalert2/pull/1459) - @jmacdone @jertel
 
 # 2.18.0