diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index a494d32a2b..93d15948ad 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -176,22 +176,45 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query: exclusions for specific opinion clusters. """ - document_list = [{"_id": f"o_{id}"} for id in related_ids] + opinion_cluster_pairs = [ + opinion_pair + for opinion_id in related_ids + if ( + opinion_pair := await Opinion.objects.filter(pk=opinion_id) + .values("pk", "cluster_id") + .afirst() + ) + ] + unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs} + + document_list = [ + { + "_id": f'o_{pair["pk"]}', + "routing": pair["cluster_id"], + # Important to match documents in the production cluster + } + for pair in opinion_cluster_pairs + ] or [ + {"_id": f"o_{pk}"} for pk in related_ids + ] # Fallback in case IDs are not found in the database. + # The user might have provided non-existent Opinion IDs. + # This ensures that the query does not raise an error and instead returns + # no results. + more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy() mlt_query = Q( "more_like_this", fields=more_like_this_fields, like=document_list, - min_term_freq=1, - max_query_terms=12, + min_term_freq=settings.RELATED_MLT_MINTF, + max_query_terms=settings.RELATED_MLT_MAXQT, + min_word_length=settings.RELATED_MLT_MINWL, + max_word_length=settings.RELATED_MLT_MAXWL, + max_doc_freq=settings.RELATED_MLT_MAXDF, + analyzer="search_analyzer_exact", ) # Exclude opinion clusters to which the related IDs to query belong. - cluster_ids_to_exclude = ( - OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids) - .distinct("pk") - .values_list("pk", flat=True) - ) - cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()] + cluster_ids_list = list(unique_clusters) exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)] bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids) return bool_query @@ -1240,7 +1263,7 @@ def build_es_base_query( {"opinion": []}, [], mlt_query, - child_highlighting=False, + child_highlighting=True, api_version=api_version, ) ) diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index 160453bb1f..a1c9d0eeeb 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -166,13 +166,11 @@ async def build_cites_clusters_query( async def build_related_clusters_query( cluster_search: Search, sub_opinion_pks: list[str], - search_params: dict[str, str], ) -> Search: """Build the ES related clusters query based on sub-opinion IDs. :param cluster_search: The Elasticsearch DSL Search object :param sub_opinion_pks: A list of IDs representing sub-opinions to be queried. - :param search_params: A dict of parameters used to form the query. :return: The ES DSL Search object representing the query to find the related clusters. """ @@ -267,10 +265,11 @@ async def es_get_citing_and_related_clusters_with_cache( related_index = citing_index = None if cached_related_clusters is None: related_query = await build_related_clusters_query( - cluster_search, sub_opinion_pks, search_params + cluster_search, sub_opinion_pks ) related_query = related_query.extra( - size=settings.RELATED_COUNT, track_total_hits=False + size=settings.RELATED_COUNT, + track_total_hits=False, ) multi_search = multi_search.add(related_query) related_index = response_index diff --git a/cl/search/constants.py b/cl/search/constants.py index 333dfbca6c..f7e76cb8fb 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -110,10 +110,10 @@ "syllabus", ] SEARCH_MLT_OPINION_QUERY_FIELDS = [ - "procedural_history", - "posture", - "syllabus", - "text", + "procedural_history.exact", + "posture.exact", + "syllabus.exact", + "text.exact", ] # ES fields that are used for highlighting diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index b276cfb508..fa187453e6 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2270,6 +2270,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None: cluster_2.delete() +@override_settings(RELATED_MLT_MINTF=1) class RelatedSearchTest( ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase ): @@ -2374,6 +2375,9 @@ def test_more_like_this_opinion(self) -> None: < r.content.decode().index("/opinion/%i/" % expected_second_pk), msg="'Howard v. Honda' should come AFTER 'case name cluster 3'.", ) + # Confirm that results contain a snippet + self.assertIn("plain", r.content.decode()) + # Confirm "related to" cluster legend is within the results' header. h2_element = html.fromstring(r.content.decode()).xpath( '//h2[@id="result-count"]'