Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

4438 Improved MLT and "Cited by" queries on the Opinion page. #4446

Merged
merged 7 commits into from
Sep 19, 2024
164 changes: 51 additions & 113 deletions cl/lib/elasticsearch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,14 @@
from functools import reduce, wraps
from typing import Any, Callable, Dict, List, Literal

from asgiref.sync import sync_to_async
from asgiref.sync import async_to_sync
from django.conf import settings
from django.core.cache import caches
from django.core.paginator import EmptyPage, Page
from django.db.models import Case, CharField
from django.core.paginator import Page
from django.db.models import Case
from django.db.models import Q as QObject
from django.db.models import QuerySet, TextField, Value, When
from django.db.models import QuerySet, TextField, When
from django.db.models.functions import Substr
from django.forms.boundfield import BoundField
from django.http import HttpRequest
from django.http.request import QueryDict
from django.utils.html import strip_tags
from django_elasticsearch_dsl.search import Search
Expand All @@ -31,9 +29,7 @@

from cl.audio.models import Audio
from cl.custom_filters.templatetags.text_filters import html_decode
from cl.lib.bot_detector import is_bot
from cl.lib.date_time import midnight_pt
from cl.lib.paginators import ESPaginator
from cl.lib.string_utils import trunc
from cl.lib.types import (
ApiPositionMapping,
Expand All @@ -59,6 +55,7 @@
RELATED_PATTERN,
SEARCH_ALERTS_ORAL_ARGUMENT_ES_HL_FIELDS,
SEARCH_HL_TAG,
SEARCH_MLT_OPINION_QUERY_FIELDS,
SEARCH_OPINION_HL_FIELDS,
SEARCH_OPINION_QUERY_FIELDS,
SEARCH_ORAL_ARGUMENT_ES_HL_FIELDS,
Expand All @@ -81,7 +78,6 @@
)
from cl.search.forms import SearchForm
from cl.search.models import (
PRECEDENTIAL_STATUS,
SEARCH_TYPES,
Court,
Opinion,
Expand Down Expand Up @@ -169,24 +165,33 @@ def build_daterange_query(
return []


def build_more_like_this_query(related_id: list[str]):
document_list = [{"_id": f"o_{id}"} for id in related_id]
more_like_this_fields = SEARCH_OPINION_QUERY_FIELDS.copy()
more_like_this_fields.extend(
[
"type",
"text",
"caseName",
"docketNumber",
]
)
return Q(
async def build_more_like_this_query(related_ids: list[str]) -> Query:
    """Assemble the ES "more like this" query for a set of Opinion IDs.

    The returned bool query requires a more_like_this match against the given
    opinions and excludes every opinion cluster those opinions belong to.

    :param related_ids: The Opinion IDs used as the "like" documents.
    :return: An ES bool query combining the "more like this" clause with a
    must_not clause on the clusters that own the related opinions.
    """

    # The "like" documents are referenced by ID using an "o_" prefix.
    similar_docs_query = Q(
        "more_like_this",
        fields=SEARCH_MLT_OPINION_QUERY_FIELDS.copy(),
        like=[{"_id": f"o_{opinion_id}"} for opinion_id in related_ids],
        min_term_freq=1,
        max_query_terms=12,
    )

    # Clusters that own the related opinions must not show up in the results.
    # The queryset is consumed asynchronously via aiterator().
    owning_clusters = (
        OpinionCluster.objects.filter(sub_opinions__pk__in=related_ids)
        .distinct("pk")
        .values_list("pk", flat=True)
    )
    excluded_cluster_pks = [
        cluster_pk async for cluster_pk in owning_clusters.aiterator()
    ]

    return Q(
        "bool",
        must=[similar_docs_query],
        must_not=[Q("terms", cluster_id=excluded_cluster_pks)],
    )


def make_es_boost_list(fields: Dict[str, float]) -> list[str]:
Expand Down Expand Up @@ -1177,7 +1182,19 @@ def build_es_base_query(
mlt_query = None
if related_match:
cluster_pks = related_match.group("pks").split(",")
mlt_query = build_more_like_this_query(cluster_pks)
mlt_query = async_to_sync(build_more_like_this_query)(
cluster_pks
)
main_query, join_query = build_full_join_es_queries(
cd,
{"opinion": []},
[],
mlt_query,
child_highlighting=False,
api_version=api_version,
)
return search_query.query(main_query), join_query

opinion_search_fields = SEARCH_OPINION_QUERY_FIELDS
child_fields = opinion_search_fields.copy()
child_fields.extend(
Expand Down Expand Up @@ -1441,6 +1458,12 @@ def add_es_highlighting(
:param highlighting: Whether highlighting should be enabled in docs.
:return: The modified Elasticsearch search query object with highlights set
"""

# Avoid highlighting for the related cluster query.
related_match = RELATED_PATTERN.search(cd.get("q", ""))
if related_match:
return search_query

highlighting_fields = {}
highlighting_keyword_fields = []
hl_tag = ALERTS_HL_TAG if alerts else SEARCH_HL_TAG
Expand Down Expand Up @@ -2035,7 +2058,6 @@ def fetch_es_results(
main_doc_count_query = build_cardinality_count(
main_doc_count_query, parent_unique_field
)

if child_docs_count_query:
child_unique_field = cardinality_query_unique_ids[
SEARCH_TYPES.RECAP_DOCUMENT
Expand Down Expand Up @@ -2461,12 +2483,13 @@ def build_full_join_es_queries(
child_filters_original = deepcopy(child_filters)
# Build child text query.
child_fields = child_query_fields[child_type]
child_text_query = build_fulltext_query(
child_fields, cd.get("q", ""), only_queries=True
)

if mlt_query:
child_text_query.append(mlt_query)
child_text_query = [mlt_query]
else:
child_text_query = build_fulltext_query(
child_fields, cd.get("q", ""), only_queries=True
)

# Build parent filters.
parent_filters = build_join_es_filters(cd)
Expand Down Expand Up @@ -2602,7 +2625,7 @@ def build_full_join_es_queries(
should=string_query,
minimum_should_match=1,
)
if parent_query:
if parent_query and not mlt_query:
q_should.append(parent_query)

if not q_should:
Expand Down Expand Up @@ -2758,91 +2781,6 @@ def merge_opinion_and_cluster(results: Page | dict) -> None:
result["status_exact"] = result["status"]


async def get_related_clusters_with_cache_and_es(
search: Search,
cluster: OpinionCluster,
request: HttpRequest,
) -> tuple[Page | list, list[int], dict[str, str]]:
"""Retrieve related opinion clusters from ES or cache.

When settings.RELATED_USE_CACHE is set, the "db_cache" cache is consulted
first using a per-cluster key. If no cached value is available, a
"related:<sub opinion ids>" opinion search is built and executed against ES,
the hits are paginated (first page, settings.RELATED_COUNT items) and the
page is written to the cache with settings.RELATED_CACHE_TIMEOUT.

Empty results are returned for bot requests, for clusters without
sub opinions and when fetch_es_results reports an error.

:param search: The ES Search object.
:param cluster: The current OpinionCluster.
:param request: The HttpRequest object.
:return: A three tuple containing a Page containing opinion clusters or an
empty list. A list containing the cluster sub opinions ids. A dict containing
the url_search_params.
"""

# By default, all statuses are included. Retrieve the PRECEDENTIAL_STATUS
# attributes (since they're indexed in ES) instead of the NAMES values.
available_statuses = [status[0] for status in PRECEDENTIAL_STATUS.NAMES]
url_search_params = {f"stat_{v}": "on" for v in available_statuses}
search_params: CleanData = {}
# Opinions that belong to the targeted cluster
# NOTE(review): the sub opinion PKs are collected before the bot check
# below, so they are fetched even for bot requests.
sub_opinion_ids = cluster.sub_opinions.values_list("pk", flat=True)
sub_opinion_pks = [pk async for pk in sub_opinion_ids]
if is_bot(request) or not sub_opinion_pks:
# If it is a bot or lacks sub-opinion IDs, return empty results
return [], [], url_search_params

# Use cache if enabled
cache = caches["db_cache"]
mlt_cache_key = f"mlt-cluster-es:{cluster.pk}"
related_clusters = (
await cache.aget(mlt_cache_key) if settings.RELATED_USE_CACHE else None
)

if settings.RELATED_FILTER_BY_STATUS:
# Filter results by status (e.g., Precedential)
# Update URL parameters accordingly: the "all statuses" defaults built
# above are replaced by the single configured status.
search_params[
f"stat_{PRECEDENTIAL_STATUS.get_status_value(settings.RELATED_FILTER_BY_STATUS)}"
] = True
url_search_params = {
f"stat_{PRECEDENTIAL_STATUS.get_status_value(settings.RELATED_FILTER_BY_STATUS)}": "on"
}

# No cached value (or cache disabled): query ES.
if related_clusters is None:
sub_opinion_queries = ",".join(str(pk) for pk in sub_opinion_pks)
search_params["q"] = f"related:{sub_opinion_queries}"
search_params["type"] = SEARCH_TYPES.OPINION
# fetch_es_results receives the same parameters as a mutable QueryDict.
query_dict = QueryDict("", mutable=True)
query_dict.update(search_params)
# build_es_main_query and fetch_es_results are synchronous helpers, so
# they are wrapped with sync_to_async to be awaited from this coroutine.
search_query, child_docs_count_query, _ = await sync_to_async(
build_es_main_query
)(search, search_params)
hits, _, error, total_query_results, _ = await sync_to_async(
fetch_es_results
)(
query_dict,
search_query,
child_docs_count_query,
1,
settings.RELATED_COUNT,
)
if error:
return [], [], url_search_params

# Pagination is also run through sync_to_async.
@sync_to_async
def paginate_related_clusters(total_results: int, results: Response):
paginator = ESPaginator(
total_results, results, settings.RELATED_COUNT
)
try:
return paginator.page(1)
except EmptyPage:
# Fall back to the last available page when page 1 is empty.
return paginator.page(paginator.num_pages)

related_clusters = await paginate_related_clusters(
total_query_results, hits
)

# Store the page under the per-cluster key for later requests.
await cache.aset(
mlt_cache_key, related_clusters, settings.RELATED_CACHE_TIMEOUT
)
return related_clusters, sub_opinion_pks, url_search_params


def make_es_stats_variable(
search_form: SearchForm,
results: Page | Response,
Expand Down
30 changes: 9 additions & 21 deletions cl/opinion_page/templates/includes/opinions_sidebar.html
Original file line number Diff line number Diff line change
@@ -1,27 +1,15 @@
{% load text_filters %}
{% load waffle_tags %}
<ul>
{% flag "o-es-active" %}
{% for opinion in opinions.object_list %}
<li>
<a href="{{ opinion.absolute_url }}{% querystring %}">
{% with opinion.title as title %}
{{ opinion.caseName|default:title|default_if_none:"N/A"|safe|truncatewords:10|v_wrapper }}
{% endwith %}
</a>
</li>
{% endfor %}
{% else %}
{% for opinion in opinions %}
<li>
<a href="{{ opinion.absolute_url }}{% querystring %}">
{% with opinion.title as title %}
{{ opinion.caseName|default:title|default_if_none:"N/A"|safe|truncatewords:10|v_wrapper }}
{% endwith %}
</a>
</li>
{% endfor %}
{% endflag %}
{% for opinion in opinions %}
<li>
<a href="{{ opinion.absolute_url }}{% querystring %}">
{% with opinion.title as title %}
{{ opinion.caseName|default:title|default_if_none:"N/A"|safe|truncatewords:10|v_wrapper }}
{% endwith %}
</a>
</li>
{% endfor %}
</ul>
<p>
<a href="{{ full_list_url }}" class="btn btn-default">
Expand Down
21 changes: 15 additions & 6 deletions cl/opinion_page/templates/opinion.html
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,17 @@ <h3>
>View Citing Opinions</a>
</p>
{% else %}
<p>This case has not yet been cited in our system.</p>
{% if queries_timeout %}
<p>Unable to retrieve citing clusters. Please try by clicking the button below:</p>
<p>
<a href="/?q=cites%3A({{ cluster.sub_opinions.all|OR_join }})"
rel="nofollow"
class="btn btn-default"
>View Citing Opinions</a>
</p>
{% else %}
<p>This case has not yet been cited in our system.</p>
{% endif %}
{% endif %}
<div class="btn-group">
<a href="/?show_alert_modal=yes&q=cites%3A({{ cluster.sub_opinions.all|OR_join }})"
Expand All @@ -159,14 +169,13 @@ <h3>


{# Related opinions #}
{% if related_clusters %}
{% if related_clusters or queries_timeout %}
<div id="recommendations" class="sidebar-section">
<h3><span>Related Case Law</span></h3>

<p class="bottom">The following case law covers similar topics:</p>

{% if not related_clusters and queries_timeout %}
<p class="bottom">Unable to retrieve related clusters. Please try by clicking the button below:</p>
{% endif %}
{% url 'show_results' as show_results_url %}

{% with sub_opinion_ids_list=sub_opinion_ids|join:',' pk_str=cluster.pk|stringformat:"s" %}
{% with opinions=related_clusters full_list_url=show_results_url|add:"?q=related:"|add:sub_opinion_ids_list|add:related_search_params %}

Expand Down
16 changes: 10 additions & 6 deletions cl/opinion_page/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from django.contrib.auth.models import Group, User
from django.core.files.uploadedfile import SimpleUploadedFile
from django.core.management import call_command
from django.test import RequestFactory, override_settings
from django.test import AsyncRequestFactory, RequestFactory, override_settings
from django.test.client import AsyncClient
from django.urls import reverse
from django.utils.text import slugify
Expand All @@ -38,7 +38,7 @@
TennWorkCompClUploadForm,
)
from cl.opinion_page.utils import (
es_get_citing_clusters_with_cache,
es_get_citing_and_related_clusters_with_cache,
generate_docket_entries_csv_data,
make_docket_title,
)
Expand Down Expand Up @@ -191,13 +191,17 @@ async def test_simple_opinion_page(self) -> None:
self.assertIn("33 state 1", response.content.decode())

async def test_es_get_citing_clusters_with_cache(self) -> None:
"""Does es_get_citing_clusters_with_cache return the correct clusters
citing and the total cites count?
"""Does es_get_citing_and_related_clusters_with_cache return the
correct clusters citing and the total cites count?
"""

clusters, count = await es_get_citing_clusters_with_cache(
self.o_cluster_3
request = AsyncRequestFactory().get("/")
result = await es_get_citing_and_related_clusters_with_cache(
self.o_cluster_3, request
)
clusters = result.citing_clusters
count = result.citing_cluster_count

c_list_names = [c["caseName"] for c in clusters]
expected_clusters = [
self.o_cluster_1.case_name,
Expand Down
Loading
Loading