Skip to content

Commit

Permalink
fix(elasticsearch): Fixed ES MLT query
Browse files (browse the repository at this point in the history)
Fixes: #4305
  • Loading branch information
albertisfu committed Nov 26, 2024
1 parent 65f59cb commit 2741121
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 17 deletions.
34 changes: 25 additions & 9 deletions cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,22 +176,38 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query:
exclusions for specific opinion clusters.
"""

document_list = [{"_id": f"o_{id}"} for id in related_ids]
opinion_cluster_pairs = [
opinion_pair
for opinion_id in related_ids
if (
opinion_pair := await Opinion.objects.filter(pk=opinion_id)
.values("pk", "cluster_id")
.afirst()
)
]
unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs}

document_list = [
{
"_id": f'o_{opinion_pair["pk"]}',
"routing": opinion_pair["cluster_id"],
}
for opinion_pair in opinion_cluster_pairs
]
more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
mlt_query = Q(
"more_like_this",
fields=more_like_this_fields,
like=document_list,
min_term_freq=1,
max_query_terms=12,
min_term_freq=settings.RELATED_MLT_MINTF,
max_query_terms=settings.RELATED_MLT_MAXQT,
min_word_length=settings.RELATED_MLT_MINWL,
max_word_length=settings.RELATED_MLT_MAXWL,
max_doc_freq=settings.RELATED_MLT_MAXDF,
analyzer="search_analyzer_exact",
)
# Exclude opinion clusters to which the related IDs to query belong.
cluster_ids_to_exclude = (
OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids)
.distinct("pk")
.values_list("pk", flat=True)
)
cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()]
cluster_ids_list = list(unique_clusters)
exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)]
bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids)
return bool_query
Expand Down
8 changes: 4 additions & 4 deletions cl/opinion_page/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,13 +166,11 @@ async def build_cites_clusters_query(
async def build_related_clusters_query(
cluster_search: Search,
sub_opinion_pks: list[str],
search_params: dict[str, str],
) -> Search:
"""Build the ES related clusters query based on sub-opinion IDs.
:param cluster_search: The Elasticsearch DSL Search object
:param sub_opinion_pks: A list of IDs representing sub-opinions to be queried.
:param search_params: A dict of parameters used to form the query.
:return: The ES DSL Search object representing the query to find the
related clusters.
"""
Expand Down Expand Up @@ -267,11 +265,13 @@ async def es_get_citing_and_related_clusters_with_cache(
related_index = citing_index = None
if cached_related_clusters is None:
related_query = await build_related_clusters_query(
cluster_search, sub_opinion_pks, search_params
cluster_search, sub_opinion_pks
)
related_query = related_query.extra(
size=settings.RELATED_COUNT, track_total_hits=False
size=settings.RELATED_COUNT,
track_total_hits=False,
)
print("Related query opinion: ", related_query.to_dict())
multi_search = multi_search.add(related_query)
related_index = response_index
response_index += 1
Expand Down
8 changes: 4 additions & 4 deletions cl/search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,10 @@
"syllabus",
]
SEARCH_MLT_OPINION_QUERY_FIELDS = [
"procedural_history",
"posture",
"syllabus",
"text",
"procedural_history.exact",
"posture.exact",
"syllabus.exact",
"text.exact",
]

# ES fields that are used for highlighting
Expand Down
1 change: 1 addition & 0 deletions cl/search/tests/tests_es_opinion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2253,6 +2253,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None:
cluster_2.delete()


@override_settings(RELATED_MLT_MINTF=1)
class RelatedSearchTest(
ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase
):
Expand Down

0 comments on commit 2741121

Please sign in to comment.