From 274112171af31e2d56f7c6b7f9607b8cb770b0d7 Mon Sep 17 00:00:00 2001
From: Alberto Islas <albertisfu@gmail.com>
Date: Tue, 26 Nov 2024 11:11:19 -0600
Subject: [PATCH 1/5] fix(elasticsearch): Fixed ES MLT query

Fixes: #4305
---
 cl/lib/elasticsearch_utils.py       | 34 +++++++++++++++++++++--------
 cl/opinion_page/utils.py            |  8 +++----
 cl/search/constants.py              |  8 +++----
 cl/search/tests/tests_es_opinion.py |  1 +
 4 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index 2c1f1053c9..96cb01653f 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -176,22 +176,38 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query:
     exclusions for specific opinion clusters.
     """
 
-    document_list = [{"_id": f"o_{id}"} for id in related_ids]
+    opinion_cluster_pairs = [
+        opinion_pair
+        for opinion_id in related_ids
+        if (
+            opinion_pair := await Opinion.objects.filter(pk=opinion_id)
+            .values("pk", "cluster_id")
+            .afirst()
+        )
+    ]
+    unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs}
+
+    document_list = [
+        {
+            "_id": f'o_{opinion_pair["pk"]}',
+            "routing": opinion_pair["cluster_id"],
+        }
+        for opinion_pair in opinion_cluster_pairs
+    ]
     more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
     mlt_query = Q(
         "more_like_this",
         fields=more_like_this_fields,
         like=document_list,
-        min_term_freq=1,
-        max_query_terms=12,
+        min_term_freq=settings.RELATED_MLT_MINTF,
+        max_query_terms=settings.RELATED_MLT_MAXQT,
+        min_word_length=settings.RELATED_MLT_MINWL,
+        max_word_length=settings.RELATED_MLT_MAXWL,
+        max_doc_freq=settings.RELATED_MLT_MAXDF,
+        analyzer="search_analyzer_exact",
     )
     # Exclude opinion clusters to which the related IDs to query belong.
-    cluster_ids_to_exclude = (
-        OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids)
-        .distinct("pk")
-        .values_list("pk", flat=True)
-    )
-    cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()]
+    cluster_ids_list = list(unique_clusters)
     exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)]
     bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids)
     return bool_query
diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py
index 160453bb1f..b135d3b020 100644
--- a/cl/opinion_page/utils.py
+++ b/cl/opinion_page/utils.py
@@ -166,13 +166,11 @@ async def build_cites_clusters_query(
 async def build_related_clusters_query(
     cluster_search: Search,
     sub_opinion_pks: list[str],
-    search_params: dict[str, str],
 ) -> Search:
     """Build the ES related clusters query based on sub-opinion IDs.
 
     :param cluster_search: The Elasticsearch DSL Search object
     :param sub_opinion_pks: A list of IDs representing sub-opinions to be queried.
-    :param search_params: A dict of parameters used to form the query.
     :return: The ES DSL Search object representing the query to find the
     related clusters.
     """
@@ -267,11 +265,13 @@ async def es_get_citing_and_related_clusters_with_cache(
     related_index = citing_index = None
     if cached_related_clusters is None:
         related_query = await build_related_clusters_query(
-            cluster_search, sub_opinion_pks, search_params
+            cluster_search, sub_opinion_pks
         )
         related_query = related_query.extra(
-            size=settings.RELATED_COUNT, track_total_hits=False
+            size=settings.RELATED_COUNT,
+            track_total_hits=False,
         )
+        print("Related query opinion: ", related_query.to_dict())
         multi_search = multi_search.add(related_query)
         related_index = response_index
         response_index += 1
diff --git a/cl/search/constants.py b/cl/search/constants.py
index 333dfbca6c..f7e76cb8fb 100644
--- a/cl/search/constants.py
+++ b/cl/search/constants.py
@@ -110,10 +110,10 @@
     "syllabus",
 ]
 SEARCH_MLT_OPINION_QUERY_FIELDS = [
-    "procedural_history",
-    "posture",
-    "syllabus",
-    "text",
+    "procedural_history.exact",
+    "posture.exact",
+    "syllabus.exact",
+    "text.exact",
 ]
 
 # ES fields that are used for highlighting
diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py
index c7d9c2568d..4996f7d985 100644
--- a/cl/search/tests/tests_es_opinion.py
+++ b/cl/search/tests/tests_es_opinion.py
@@ -2253,6 +2253,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None:
         cluster_2.delete()
 
 
+@override_settings(RELATED_MLT_MINTF=1)
 class RelatedSearchTest(
     ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase
 ):

From d8b72b08aea225cb7f1e9cb1ee1f1f114349a1f5 Mon Sep 17 00:00:00 2001
From: Alberto Islas <albertisfu@gmail.com>
Date: Tue, 26 Nov 2024 11:34:20 -0600
Subject: [PATCH 2/5] fix(elasticsearch): Added a fallback to the MLT query in
 case the IDs are not found in the DB

---
 cl/lib/elasticsearch_utils.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index 96cb01653f..f0a88ce0ea 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -189,11 +189,15 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query:
 
     document_list = [
         {
-            "_id": f'o_{opinion_pair["pk"]}',
-            "routing": opinion_pair["cluster_id"],
+            "_id": f'o_{pair["pk"]}',
+            "routing": pair["cluster_id"],
+            # Routing is important to match documents in the production cluster
         }
-        for opinion_pair in opinion_cluster_pairs
-    ]
+        for pair in opinion_cluster_pairs
+    ] or [
+        {"_id": f"o_{pk}"} for pk in related_ids
+    ]  # Fall back in case IDs are not found in DB.
+
     more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
     mlt_query = Q(
         "more_like_this",

From 00885f3e6e84243d99b35830e346cbc866a0a6d4 Mon Sep 17 00:00:00 2001
From: Alberto Islas <albertisfu@gmail.com>
Date: Tue, 26 Nov 2024 13:17:51 -0600
Subject: [PATCH 3/5] fix(elasticsearch): Removed stray print

---
 cl/opinion_page/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py
index b135d3b020..a1c9d0eeeb 100644
--- a/cl/opinion_page/utils.py
+++ b/cl/opinion_page/utils.py
@@ -271,7 +271,6 @@ async def es_get_citing_and_related_clusters_with_cache(
             size=settings.RELATED_COUNT,
             track_total_hits=False,
         )
-        print("Related query opinion: ", related_query.to_dict())
         multi_search = multi_search.add(related_query)
         related_index = response_index
         response_index += 1

From fc3a2c727d42f702aa7fc3df860475bc6b6bb0e9 Mon Sep 17 00:00:00 2001
From: Alberto Islas <albertisfu@gmail.com>
Date: Wed, 27 Nov 2024 10:40:20 -0600
Subject: [PATCH 4/5] fix(elasticsearch): Enabled child highlighting for the
 related: query

---
 cl/lib/elasticsearch_utils.py       | 2 +-
 cl/search/tests/tests_es_opinion.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index f0a88ce0ea..f7dbb19708 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -1260,7 +1260,7 @@ def build_es_base_query(
                         {"opinion": []},
                         [],
                         mlt_query,
-                        child_highlighting=False,
+                        child_highlighting=True,
                         api_version=api_version,
                     )
                 )
diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py
index 4996f7d985..5266c76ff8 100644
--- a/cl/search/tests/tests_es_opinion.py
+++ b/cl/search/tests/tests_es_opinion.py
@@ -2358,6 +2358,9 @@ def test_more_like_this_opinion(self) -> None:
             < r.content.decode().index("/opinion/%i/" % expected_second_pk),
             msg="'Howard v. Honda' should come AFTER 'case name cluster 3'.",
         )
+        # Confirm that results contain a snippet
+        self.assertIn("<mark>plain</mark>", r.content.decode())
+
         # Confirm "related to" cluster legend is within the results' header.
         h2_element = html.fromstring(r.content.decode()).xpath(
             '//h2[@id="result-count"]'

From 62bdf183658933aba4c70b8c3e8c4e5fe5a8d2e7 Mon Sep 17 00:00:00 2001
From: Alberto Islas <albertisfu@gmail.com>
Date: Thu, 28 Nov 2024 11:39:58 -0600
Subject: [PATCH 5/5] fix(elasticsearch): Improved comment in
 build_more_like_this_query

---
 cl/lib/elasticsearch_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
index f7dbb19708..3d0b2c7fa2 100644
--- a/cl/lib/elasticsearch_utils.py
+++ b/cl/lib/elasticsearch_utils.py
@@ -196,7 +196,10 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query:
         for pair in opinion_cluster_pairs
     ] or [
         {"_id": f"o_{pk}"} for pk in related_ids
-    ]  # Fall back in case IDs are not found in DB.
+    ]  # Fallback in case IDs are not found in the database.
+    # The user might have provided non-existent Opinion IDs.
+    # This ensures that the query does not raise an error and instead returns
+    # no results.
 
     more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
     mlt_query = Q(