From 274112171af31e2d56f7c6b7f9607b8cb770b0d7 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 26 Nov 2024 11:11:19 -0600 Subject: [PATCH 1/5] fix(elasticsearch): Fixed ES MLT query Fixes: #4305 --- cl/lib/elasticsearch_utils.py | 34 +++++++++++++++++++++-------- cl/opinion_page/utils.py | 8 +++---- cl/search/constants.py | 8 +++---- cl/search/tests/tests_es_opinion.py | 1 + 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 2c1f1053c9..96cb01653f 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -176,22 +176,38 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query: exclusions for specific opinion clusters. """ - document_list = [{"_id": f"o_{id}"} for id in related_ids] + opinion_cluster_pairs = [ + opinion_pair + for opinion_id in related_ids + if ( + opinion_pair := await Opinion.objects.filter(pk=opinion_id) + .values("pk", "cluster_id") + .afirst() + ) + ] + unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs} + + document_list = [ + { + "_id": f'o_{opinion_pair["pk"]}', + "routing": opinion_pair["cluster_id"], + } + for opinion_pair in opinion_cluster_pairs + ] more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy() mlt_query = Q( "more_like_this", fields=more_like_this_fields, like=document_list, - min_term_freq=1, - max_query_terms=12, + min_term_freq=settings.RELATED_MLT_MINTF, + max_query_terms=settings.RELATED_MLT_MAXQT, + min_word_length=settings.RELATED_MLT_MINWL, + max_word_length=settings.RELATED_MLT_MAXWL, + max_doc_freq=settings.RELATED_MLT_MAXDF, + analyzer="search_analyzer_exact", ) # Exclude opinion clusters to which the related IDs to query belong. - cluster_ids_to_exclude = ( - OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids) - .distinct("pk") - .values_list("pk", flat=True) - ) - cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()] + cluster_ids_list = list(unique_clusters) exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)] bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids) return bool_query diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index 160453bb1f..b135d3b020 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -166,13 +166,11 @@ async def build_cites_clusters_query( async def build_related_clusters_query( cluster_search: Search, sub_opinion_pks: list[str], - search_params: dict[str, str], ) -> Search: """Build the ES related clusters query based on sub-opinion IDs. :param cluster_search: The Elasticsearch DSL Search object :param sub_opinion_pks: A list of IDs representing sub-opinions to be queried. - :param search_params: A dict of parameters used to form the query. :return: The ES DSL Search object representing the query to find the related clusters. """ @@ -267,11 +265,13 @@ async def es_get_citing_and_related_clusters_with_cache( related_index = citing_index = None if cached_related_clusters is None: related_query = await build_related_clusters_query( - cluster_search, sub_opinion_pks, search_params + cluster_search, sub_opinion_pks ) related_query = related_query.extra( - size=settings.RELATED_COUNT, track_total_hits=False + size=settings.RELATED_COUNT, + track_total_hits=False, ) + print("Related query opinion: ", related_query.to_dict()) multi_search = multi_search.add(related_query) related_index = response_index response_index += 1 diff --git a/cl/search/constants.py b/cl/search/constants.py index 333dfbca6c..f7e76cb8fb 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -110,10 +110,10 @@ "syllabus", ] SEARCH_MLT_OPINION_QUERY_FIELDS = [ - "procedural_history", - "posture", - "syllabus", - "text", + "procedural_history.exact", + "posture.exact", + "syllabus.exact", + "text.exact", ] # ES fields that are used for highlighting diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index c7d9c2568d..4996f7d985 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2253,6 +2253,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None: cluster_2.delete() +@override_settings(RELATED_MLT_MINTF=1) class RelatedSearchTest( ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase ): From d8b72b08aea225cb7f1e9cb1ee1f1f114349a1f5 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 26 Nov 2024 11:34:20 -0600 Subject: [PATCH 2/5] fix(elasticsearch): Added a fallback to the MLT query in case the IDs are not found in the DB --- cl/lib/elasticsearch_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 96cb01653f..f0a88ce0ea 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -189,11 +189,15 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query: document_list = [ { - "_id": f'o_{opinion_pair["pk"]}', - "routing": opinion_pair["cluster_id"], + "_id": f'o_{pair["pk"]}', + "routing": pair["cluster_id"], + # Important to match documents in the production cluster } - for opinion_pair in opinion_cluster_pairs - ] + for pair in opinion_cluster_pairs + ] or [ + {"_id": f"o_{pk}"} for pk in related_ids + ] # Fall back in case IDs are not found in DB. + more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy() mlt_query = Q( "more_like_this", From 00885f3e6e84243d99b35830e346cbc866a0a6d4 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 26 Nov 2024 13:17:51 -0600 Subject: [PATCH 3/5] fix(elasticsearch): Removed stray print --- cl/opinion_page/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index b135d3b020..a1c9d0eeeb 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -271,7 +271,6 @@ async def es_get_citing_and_related_clusters_with_cache( size=settings.RELATED_COUNT, track_total_hits=False, ) - print("Related query opinion: ", related_query.to_dict()) multi_search = multi_search.add(related_query) related_index = response_index response_index += 1 From fc3a2c727d42f702aa7fc3df860475bc6b6bb0e9 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Wed, 27 Nov 2024 10:40:20 -0600 Subject: [PATCH 4/5] fix(elasticsearch): Enabled child highlighting for the related: query --- cl/lib/elasticsearch_utils.py | 2 +- cl/search/tests/tests_es_opinion.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index f0a88ce0ea..f7dbb19708 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -1260,7 +1260,7 @@ def build_es_base_query( {"opinion": []}, [], mlt_query, - child_highlighting=False, + child_highlighting=True, api_version=api_version, ) ) diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index 4996f7d985..5266c76ff8 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2358,6 +2358,9 @@ def test_more_like_this_opinion(self) -> None: < r.content.decode().index("/opinion/%i/" % expected_second_pk), msg="'Howard v. Honda' should come AFTER 'case name cluster 3'.", ) + # Confirm that results contain a snippet + self.assertIn("plain", r.content.decode()) + # Confirm "related to" cluster legend is within the results' header. h2_element = html.fromstring(r.content.decode()).xpath( '//h2[@id="result-count"]' From 62bdf183658933aba4c70b8c3e8c4e5fe5a8d2e7 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 28 Nov 2024 11:39:58 -0600 Subject: [PATCH 5/5] fix(elasticsearch): Improved comment in build_more_like_this_query --- cl/lib/elasticsearch_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index f7dbb19708..3d0b2c7fa2 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -196,7 +196,10 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query: for pair in opinion_cluster_pairs ] or [ {"_id": f"o_{pk}"} for pk in related_ids - ] # Fall back in case IDs are not found in DB. + ] # Fallback in case IDs are not found in the database. + # The user might have provided non-existent Opinion IDs. + # This ensures that the query does not raise an error and instead returns + # no results. more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy() mlt_query = Q(