Skip to content

Commit

Permalink
Merge pull request #4735 from freelawproject/4305-fix-es-mlt-query
Browse files Browse the repository at this point in the history
4305 Fixed ES More Like This query
  • Loading branch information
albertisfu authored Nov 28, 2024
2 parents 78f7de9 + 5ae668a commit 67b1fc2
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 18 deletions.
43 changes: 33 additions & 10 deletions cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,22 +176,45 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query:
exclusions for specific opinion clusters.
"""

document_list = [{"_id": f"o_{id}"} for id in related_ids]
opinion_cluster_pairs = [
opinion_pair
for opinion_id in related_ids
if (
opinion_pair := await Opinion.objects.filter(pk=opinion_id)
.values("pk", "cluster_id")
.afirst()
)
]
unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs}

document_list = [
{
"_id": f'o_{pair["pk"]}',
"routing": pair["cluster_id"],
# Important to match documents in the production cluster
}
for pair in opinion_cluster_pairs
] or [
{"_id": f"o_{pk}"} for pk in related_ids
] # Fallback in case IDs are not found in the database.
# The user might have provided non-existent Opinion IDs.
# This ensures that the query does not raise an error and instead returns
# no results.

more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
mlt_query = Q(
"more_like_this",
fields=more_like_this_fields,
like=document_list,
min_term_freq=1,
max_query_terms=12,
min_term_freq=settings.RELATED_MLT_MINTF,
max_query_terms=settings.RELATED_MLT_MAXQT,
min_word_length=settings.RELATED_MLT_MINWL,
max_word_length=settings.RELATED_MLT_MAXWL,
max_doc_freq=settings.RELATED_MLT_MAXDF,
analyzer="search_analyzer_exact",
)
# Exclude opinion clusters to which the related IDs to query belong.
cluster_ids_to_exclude = (
OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids)
.distinct("pk")
.values_list("pk", flat=True)
)
cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()]
cluster_ids_list = list(unique_clusters)
exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)]
bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids)
return bool_query
Expand Down Expand Up @@ -1240,7 +1263,7 @@ def build_es_base_query(
{"opinion": []},
[],
mlt_query,
child_highlighting=False,
child_highlighting=True,
api_version=api_version,
)
)
Expand Down
7 changes: 3 additions & 4 deletions cl/opinion_page/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,13 +166,11 @@ async def build_cites_clusters_query(
async def build_related_clusters_query(
cluster_search: Search,
sub_opinion_pks: list[str],
search_params: dict[str, str],
) -> Search:
"""Build the ES related clusters query based on sub-opinion IDs.
:param cluster_search: The Elasticsearch DSL Search object
:param sub_opinion_pks: A list of IDs representing sub-opinions to be queried.
:param search_params: A dict of parameters used to form the query.
:return: The ES DSL Search object representing the query to find the
related clusters.
"""
Expand Down Expand Up @@ -267,10 +265,11 @@ async def es_get_citing_and_related_clusters_with_cache(
related_index = citing_index = None
if cached_related_clusters is None:
related_query = await build_related_clusters_query(
cluster_search, sub_opinion_pks, search_params
cluster_search, sub_opinion_pks
)
related_query = related_query.extra(
size=settings.RELATED_COUNT, track_total_hits=False
size=settings.RELATED_COUNT,
track_total_hits=False,
)
multi_search = multi_search.add(related_query)
related_index = response_index
Expand Down
8 changes: 4 additions & 4 deletions cl/search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,10 @@
"syllabus",
]
SEARCH_MLT_OPINION_QUERY_FIELDS = [
"procedural_history",
"posture",
"syllabus",
"text",
"procedural_history.exact",
"posture.exact",
"syllabus.exact",
"text.exact",
]

# ES fields that are used for highlighting
Expand Down
4 changes: 4 additions & 0 deletions cl/search/tests/tests_es_opinion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2270,6 +2270,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None:
cluster_2.delete()


@override_settings(RELATED_MLT_MINTF=1)
class RelatedSearchTest(
ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase
):
Expand Down Expand Up @@ -2374,6 +2375,9 @@ def test_more_like_this_opinion(self) -> None:
< r.content.decode().index("/opinion/%i/" % expected_second_pk),
msg="'Howard v. Honda' should come AFTER 'case name cluster 3'.",
)
# Confirm that results contain a snippet
self.assertIn("<mark>plain</mark>", r.content.decode())

# Confirm "related to" cluster legend is within the results' header.
h2_element = html.fromstring(r.content.decode()).xpath(
'//h2[@id="result-count"]'
Expand Down

0 comments on commit 67b1fc2

Please sign in to comment.