Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

4305 Fixed ES More Like This query #4735

Merged
merged 8 commits into from
Nov 28, 2024
43 changes: 33 additions & 10 deletions cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,22 +176,45 @@ async def build_more_like_this_query(related_ids: list[str]) -> Query:
exclusions for specific opinion clusters.
"""

document_list = [{"_id": f"o_{id}"} for id in related_ids]
opinion_cluster_pairs = [
opinion_pair
for opinion_id in related_ids
if (
opinion_pair := await Opinion.objects.filter(pk=opinion_id)
.values("pk", "cluster_id")
.afirst()
)
]
unique_clusters = {pair["cluster_id"] for pair in opinion_cluster_pairs}

document_list = [
{
"_id": f'o_{pair["pk"]}',
"routing": pair["cluster_id"],
# Routing by cluster ID is important to match documents in the production cluster
}
for pair in opinion_cluster_pairs
] or [
{"_id": f"o_{pk}"} for pk in related_ids
] # Fallback in case IDs are not found in the database.
# The user might have provided non-existent Opinion IDs.
# The fallback ensures that the query does not raise an error and instead
# returns no results.

more_like_this_fields = SEARCH_MLT_OPINION_QUERY_FIELDS.copy()
mlt_query = Q(
"more_like_this",
fields=more_like_this_fields,
like=document_list,
min_term_freq=1,
max_query_terms=12,
min_term_freq=settings.RELATED_MLT_MINTF,
max_query_terms=settings.RELATED_MLT_MAXQT,
min_word_length=settings.RELATED_MLT_MINWL,
max_word_length=settings.RELATED_MLT_MAXWL,
max_doc_freq=settings.RELATED_MLT_MAXDF,
analyzer="search_analyzer_exact",
)
# Exclude opinion clusters to which the related IDs to query belong.
cluster_ids_to_exclude = (
OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids)
.distinct("pk")
.values_list("pk", flat=True)
)
cluster_ids_list = [pk async for pk in cluster_ids_to_exclude.aiterator()]
cluster_ids_list = list(unique_clusters)
exclude_cluster_ids = [Q("terms", cluster_id=cluster_ids_list)]
bool_query = Q("bool", must=[mlt_query], must_not=exclude_cluster_ids)
return bool_query
Expand Down Expand Up @@ -1240,7 +1263,7 @@ def build_es_base_query(
{"opinion": []},
[],
mlt_query,
child_highlighting=False,
child_highlighting=True,
api_version=api_version,
)
)
Expand Down
7 changes: 3 additions & 4 deletions cl/opinion_page/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,13 +166,11 @@ async def build_cites_clusters_query(
async def build_related_clusters_query(
cluster_search: Search,
sub_opinion_pks: list[str],
search_params: dict[str, str],
) -> Search:
"""Build the ES related clusters query based on sub-opinion IDs.

:param cluster_search: The Elasticsearch DSL Search object
:param sub_opinion_pks: A list of IDs representing sub-opinions to be queried.
:param search_params: A dict of parameters used to form the query.
:return: The ES DSL Search object representing the query to find the
related clusters.
"""
Expand Down Expand Up @@ -267,10 +265,11 @@ async def es_get_citing_and_related_clusters_with_cache(
related_index = citing_index = None
if cached_related_clusters is None:
related_query = await build_related_clusters_query(
cluster_search, sub_opinion_pks, search_params
cluster_search, sub_opinion_pks
)
related_query = related_query.extra(
size=settings.RELATED_COUNT, track_total_hits=False
size=settings.RELATED_COUNT,
track_total_hits=False,
)
mlissner marked this conversation as resolved.
Show resolved Hide resolved
multi_search = multi_search.add(related_query)
related_index = response_index
Expand Down
8 changes: 4 additions & 4 deletions cl/search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,10 @@
"syllabus",
]
SEARCH_MLT_OPINION_QUERY_FIELDS = [
"procedural_history",
"posture",
"syllabus",
"text",
"procedural_history.exact",
"posture.exact",
"syllabus.exact",
"text.exact",
]

# ES fields that are used for highlighting
Expand Down
4 changes: 4 additions & 0 deletions cl/search/tests/tests_es_opinion.py
Original file line number Diff line number Diff line change
Expand Up @@ -2270,6 +2270,7 @@ def test_uses_exact_version_for_case_name_field(self) -> None:
cluster_2.delete()


@override_settings(RELATED_MLT_MINTF=1)
class RelatedSearchTest(
ESIndexTestCase, CourtTestCase, PeopleTestCase, SearchTestCase, TestCase
):
Expand Down Expand Up @@ -2374,6 +2375,9 @@ def test_more_like_this_opinion(self) -> None:
< r.content.decode().index("/opinion/%i/" % expected_second_pk),
msg="'Howard v. Honda' should come AFTER 'case name cluster 3'.",
)
# Confirm that results contain a snippet
self.assertIn("<mark>plain</mark>", r.content.decode())

# Confirm "related to" cluster legend is within the results' header.
h2_element = html.fromstring(r.content.decode()).xpath(
'//h2[@id="result-count"]'
Expand Down