diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 2c1f1053c9..96cb01653f 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -176,22 +176,38 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query: exclusions for specific opinion clusters. """ - document_list = [{"_id": f"o_{id}"} for id in related_ids] + opinion_cluster_pairs = [ + opinion_pair + for opinion_id in related_ids + if ( + opinion_pair := await Opinion.objects.filter(pk=opinion_id) + .values("pk", "cluster_id") + .afirst() + ) + ] + unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs} + + document_list = [ + { + "_id": f'o_{opinion_pair["pk"]}', + "routing": opinion_pair["cluster_id"], + } + for opinion_pair in opinion_cluster_pairs + ] more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy() mlt_query = Q( "more_like_this", fields=more_like_this_fields, like=document_list, - min_term_freq=1, - max_query_terms=12, + min_term_freq=settings.RELATED_MLT_MINTF, + max_query_terms=settings.RELATED_MLT_MAXQT, + min_word_length=settings.RELATED_MLT_MINWL, + max_word_length=settings.RELATED_MLT_MAXWL, + max_doc_freq=settings.RELATED_MLT_MAXDF, + analyzer="search_analyzer_exact", ) # Exclude opinion clusters to which the related IDs to query belong. 
- cluster_ids_to_exclude = ( - OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids) - .distinct("pk") - .values_list("pk", flat=True) - ) - cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()] + cluster_ids_list = list(unique_clusters) exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)] bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids) return bool_query diff --git a/cl/opinion_page/utils.py b/cl/opinion_page/utils.py index 160453bb1f..b135d3b020 100644 --- a/cl/opinion_page/utils.py +++ b/cl/opinion_page/utils.py @@ -166,13 +166,11 @@ async def build_cites_clusters_query( async def build_related_clusters_query( cluster_search: Search, sub_opinion_pks: list[str], - search_params: dict[str, str], ) -> Search: """Build the ES related clusters query based on sub-opinion IDs. :param cluster_search: The Elasticsearch DSL Search object :param sub_opinion_pks: A list of IDs representing sub-opinions to be queried. - :param search_params: A dict of parameters used to form the query. :return: The ES DSL Search object representing the query to find the related clusters. 
""" @@ -267,11 +265,12 @@ async def es_get_citing_and_related_clusters_with_cache( related_index = citing_index = None if cached_related_clusters is None: related_query = await build_related_clusters_query( - cluster_search, sub_opinion_pks, search_params + cluster_search, sub_opinion_pks ) related_query = related_query.extra( - size=settings.RELATED_COUNT, track_total_hits=False + size=settings.RELATED_COUNT, + track_total_hits=False, ) multi_search = multi_search.add(related_query) related_index = response_index response_index += 1 diff --git a/cl/search/constants.py b/cl/search/constants.py index 333dfbca6c..f7e76cb8fb 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -110,10 +110,10 @@ "syllabus", ] SEARCH_MLT_OPINION_QUERY_FIELDS = [ - "procedural_history", - "posture", - "syllabus", - "text", + "procedural_history.exact", + "posture.exact", + "syllabus.exact", + "text.exact", ] # ES fields that are used for highlighting diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index c7d9c2568d..4996f7d985 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2253,6 +2253,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None: cluster_2.delete() +@override_settings(RELATED_MLT_MINTF=1) class RelatedSearchTest( ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase ):