freelawproject · albertisfu · Nov 28, 2024 · Nov 26, 2024 · Nov 26, 2024 · Nov 26, 2024
diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py
@@ -176,22 +176,45 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query:
     exclusions for specific opinion clusters.
     """
 
-    document_list = [{"_id": f"o_{id}"} for id in related_ids]
+    opinion_cluster_pairs = [
+        opinion_pair
+        for opinion_id in related_ids
+        if (
+            opinion_pair := await Opinion.objects.filter(pk=opinion_id)
+            .values("pk", "cluster_id")
+            .afirst()
+        )
+    ]
+    unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs}
+
+    document_list = [
+        {
+            "_id": f'o_{pair["pk"]}',
+            "routing": pair["cluster_id"],
+            # Routing is important to match documents in the production cluster
+        }
+        for pair in opinion_cluster_pairs
+    ] or [
+        {"_id": f"o_{pk}"} for pk in related_ids
+    ]  # Fallback in case IDs are not found in the database.
+    # The user may have provided non-existent Opinion IDs. Falling back to the
+    # raw IDs ensures that the query does not raise an error and instead
+    # returns no results.
+
     more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
     mlt_query = Q(
         "more_like_this",
         fields=more_like_this_fields,
         like=document_list,
-        min_term_freq=1,
-        max_query_terms=12,
+        min_term_freq=settings.RELATED_MLT_MINTF,
+        max_query_terms=settings.RELATED_MLT_MAXQT,
+        min_word_length=settings.RELATED_MLT_MINWL,
+        max_word_length=settings.RELATED_MLT_MAXWL,
+        max_doc_freq=settings.RELATED_MLT_MAXDF,
+        analyzer="search_analyzer_exact",
     )
     # Exclude opinion clusters to which the related IDs to query belong.
-    cluster_ids_to_exclude = (
-        OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids)
-        .distinct("pk")
-        .values_list("pk", flat=True)
-    )
-    cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()]
+    cluster_ids_list = list(unique_clusters)
     exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)]
     bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids)
     return bool_query
@@ -1240,7 +1263,7 @@ def build_es_base_query(
                         {"opinion": []},
                         [],
                         mlt_query,
-                        child_highlighting=False,
+                        child_highlighting=True,
                         api_version=api_version,
                     )
                 )

diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py
@@ -166,13 +166,11 @@ async def build_cites_clusters_query(
 async def build_related_clusters_query(
     cluster_search: Search,
     sub_opinion_pks: list[str],
-    search_params: dict[str, str],
 ) -> Search:
     """Build the ES related clusters query based on sub-opinion IDs.
 
     :param cluster_search: The Elasticsearch DSL Search object
     :param sub_opinion_pks: A list of IDs representing sub-opinions to be queried.
-    :param search_params: A dict of parameters used to form the query.
     :return: The ES DSL Search object representing the query to find the
     related clusters.
     """
@@ -267,10 +265,11 @@ async def es_get_citing_and_related_clusters_with_cache(
     related_index = citing_index = None
     if cached_related_clusters is None:
         related_query = await build_related_clusters_query(
-            cluster_search, sub_opinion_pks, search_params
+            cluster_search, sub_opinion_pks
         )
         related_query = related_query.extra(
-            size=settings.RELATED_COUNT, track_total_hits=False
+            size=settings.RELATED_COUNT,
+            track_total_hits=False,
         )
         multi_search = multi_search.add(related_query)
         related_index = response_index

diff --git a/cl/search/constants.py b/cl/search/constants.py
@@ -110,10 +110,10 @@
     "syllabus",
 ]
 SEARCH_MLT_OPINION_QUERY_FIELDS = [
-    "procedural_history",
-    "posture",
-    "syllabus",
-    "text",
+    "procedural_history.exact",
+    "posture.exact",
+    "syllabus.exact",
+    "text.exact",
 ]
 
 # ES fields that are used for highlighting

diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py
@@ -2270,6 +2270,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None:
         cluster_2.delete()
 
 
+@override_settings(RELATED_MLT_MINTF=1)
 class RelatedSearchTest(
     ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase
 ):
@@ -2374,6 +2375,9 @@ def test_more_like_this_opinion(self) -> None:
             < r.content.decode().index("/opinion/%i/" % expected_second_pk),
             msg="'Howard v. Honda' should come AFTER 'case name cluster 3'.",
         )
+        # Confirm that the results contain a highlighted snippet.
+        self.assertIn("<mark>plain</mark>", r.content.decode())
+
         # Confirm "related to" cluster legend is within the results' header.
         h2_element = html.fromstring(r.content.decode()).xpath(
             '//h2[@id="result-count"]'