refactor(search): Moves helper functions to search_utils.py
This commit refactors the search module by moving helper functions from `view.py` to `search_utils.py`. This improves code organization and makes the helpers reusable across modules.
ERosendo committed Jan 16, 2025
1 parent 8cda135 commit 93d31a2
Showing 4 changed files with 373 additions and 367 deletions.
369 changes: 364 additions & 5 deletions cl/lib/search_utils.py
@@ -1,15 +1,54 @@
+import pickle
 import re
-from typing import Any, Dict, List, Optional, Tuple, cast
+from typing import Any, Dict, List, Optional, Tuple, TypedDict
 from urllib.parse import parse_qs, urlencode
 
-from asgiref.sync import sync_to_async
-from django.core.paginator import Page
+from asgiref.sync import async_to_sync, sync_to_async
+from django.conf import settings
+from django.core.cache import cache
+from django.core.paginator import EmptyPage, Page, PageNotAnInteger
 from django.http import HttpRequest
 from django.http.request import QueryDict
+from django_elasticsearch_dsl.search import Search
+from eyecite.models import FullCaseCitation
+from eyecite.tokenizers import HyperscanTokenizer
 
+from cl.citations.match_citations_queries import es_get_query_citation
 from cl.citations.utils import get_citation_depth_between_clusters
-from cl.lib.types import SearchParam
-from cl.search.forms import SearchForm
+from cl.lib.crypto import sha256
+from cl.lib.elasticsearch_utils import (
+    build_es_main_query,
+    compute_lowest_possible_estimate,
+    convert_str_date_fields_to_date_objects,
+    fetch_es_results,
+    get_facet_dict_for_search_query,
+    limit_inner_hits,
+    merge_courts_from_db,
+    merge_unavailable_fields_on_parent_document,
+    set_results_highlights,
+    simplify_estimated_count,
+)
+from cl.lib.paginators import ESPaginator
+from cl.lib.types import CleanData
+from cl.lib.utils import (
+    sanitize_unbalanced_parenthesis,
+    sanitize_unbalanced_quotes,
+)
+from cl.search.constants import RELATED_PATTERN
+from cl.search.documents import (
+    AudioDocument,
+    DocketDocument,
+    OpinionClusterDocument,
+    ParentheticalGroupDocument,
+    PersonDocument,
+)
+from cl.search.exception import (
+    BadProximityQuery,
+    DisallowedWildcardPattern,
+    UnbalancedParenthesesQuery,
+    UnbalancedQuotesQuery,
+)
+from cl.search.forms import SearchForm, _clean_form
 from cl.search.models import (
     SEARCH_TYPES,
     Court,
@@ -261,3 +300,323 @@ def store_search_api_query(
         source=SearchQuery.API,
         engine=engine,
     )
+
+
+class CachedESSearchResults(TypedDict):
+    results: Page | list
+    main_total: int | None
+    child_total: int | None
+
+
+def retrieve_cached_search_results(
+    get_params: QueryDict,
+) -> tuple[CachedESSearchResults | None, str]:
+    """
+    Retrieve cached search results based on the GET parameters.
+    :param get_params: The GET parameters provided by the user.
+    :return: A two-tuple containing either the cached search results and the
+    cache key based on a prefix and the GET parameters, or None and the
+    cache key if no cached results were found.
+    """
+
+    params = get_params.copy()
+    # If no page is present in the parameters, set it to 1 to generate the same
+    # hash for page 1, regardless of whether the page parameter is included.
+    # Apply the same to the q parameter when it is not present in params.
+    params.setdefault("page", "1")
+    params.setdefault("q", "")
+    sorted_params = dict(sorted(params.items()))
+    key_prefix = "search_results_cache:"
+    params_hash = sha256(pickle.dumps(sorted_params))
+    cache_key = f"{key_prefix}{params_hash}"
+    cached_results = cache.get(cache_key)
+    if cached_results:
+        return pickle.loads(cached_results), cache_key
+    return None, cache_key
+
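# Illustrative sketch (not part of this commit): because of the setdefault()
# normalization above, a request with no explicit page and the same request
# with page=1 hash to the same micro-cache key.
#
#   from django.http import QueryDict
#
#   _, key_a = retrieve_cached_search_results(QueryDict("q=foo"))
#   _, key_b = retrieve_cached_search_results(QueryDict("q=foo&page=1"))
#   assert key_a == key_b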
+
+def fetch_and_paginate_results(
+    get_params: QueryDict,
+    search_query: Search,
+    child_docs_count_query: Search | None,
+    rows_per_page: int = settings.SEARCH_PAGE_SIZE,
+    cache_key: str | None = None,
+) -> tuple[Page | list, int, bool, int | None, int | None]:
+    """Fetch and paginate Elasticsearch results.
+    :param get_params: The user's GET params.
+    :param search_query: Elasticsearch DSL Search object.
+    :param child_docs_count_query: The ES DSL query to perform the count for
+    child documents if required, otherwise None.
+    :param rows_per_page: Number of records wanted per page.
+    :param cache_key: The cache key to use.
+    :return: A five-tuple: the paginated results, the ES query time, whether
+    there was an error, the total number of hits for the main documents, and
+    the total number of hits for the child documents.
+    """
+
+    # Run the query and set up pagination
+    if cache_key is not None:
+        # Check the cache for displaying insights on the Home Page.
+        results = cache.get(cache_key)
+        if results is not None:
+            return results, 0, False, None, None
+
+    # Check the micro-cache for all other search requests.
+    results_dict, micro_cache_key = retrieve_cached_search_results(get_params)
+    if results_dict:
+        # Return cached results and counts. Set the query time to 1ms.
+        return (
+            results_dict["results"],
+            1,
+            False,
+            results_dict["main_total"],
+            results_dict["child_total"],
+        )
+
+    try:
+        page = int(get_params.get("page", 1))
+    except ValueError:
+        page = 1
+
+    # Check the pagination depth.
+    check_pagination_depth(page)
+
+    # Fetch results from ES.
+    hits, query_time, error, main_total, child_total = fetch_es_results(
+        get_params, search_query, child_docs_count_query, page, rows_per_page
+    )
+
+    if error:
+        return [], query_time, error, main_total, child_total
+    paginator = ESPaginator(main_total, hits, rows_per_page)
+    try:
+        results = paginator.page(page)
+    except PageNotAnInteger:
+        results = paginator.page(1)
+    except EmptyPage:
+        results = paginator.page(paginator.num_pages)
+
+    search_type = get_params.get("type", SEARCH_TYPES.OPINION)
+    # Post-process the results: convert string dates to date objects, merge
+    # court data from the DB, limit inner hits, set highlights, and merge
+    # fields that are unavailable on parent documents.
+    convert_str_date_fields_to_date_objects(results, search_type)
+    merge_courts_from_db(results, search_type)
+    limit_inner_hits(get_params, results, search_type)
+    set_results_highlights(results, search_type)
+    merge_unavailable_fields_on_parent_document(results, search_type)
+
+    if cache_key is not None:
+        # Cache only Page results for displaying insights on the Home Page.
+        cache.set(cache_key, results, settings.QUERY_RESULTS_CACHE)
+    elif settings.ELASTICSEARCH_MICRO_CACHE_ENABLED:
+        # Cache Page results and counts for all other search requests.
+        results_dict = {
+            "results": results,
+            "main_total": main_total,
+            "child_total": child_total,
+        }
+        serialized_data = pickle.dumps(results_dict)
+        cache.set(
+            micro_cache_key,
+            serialized_data,
+            settings.SEARCH_RESULTS_MICRO_CACHE,
+        )
+
+    return results, query_time, error, main_total, child_total
+
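# Illustrative sketch (not part of this commit): typical wiring of a built
# query through this paginator; "cd" is the form's cleaned data as used in
# do_es_search() below.
#
#   s, child_count_query, _ = build_es_main_query(DocketDocument.search(), cd)
#   results, query_time, error, total, child_total = fetch_and_paginate_results(
#       request.GET, s, child_count_query, rows_per_page=20
#   )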
+
+def remove_missing_citations(
+    missing_citations: list[FullCaseCitation], cd: CleanData
+) -> tuple[list[str], str]:
+    """Removes missing citations from the query and returns the missing
+    citations as strings and the modified query.
+    :param missing_citations: A list of FullCaseCitation objects representing
+    the citations that are missing from the query.
+    :param cd: A CleanData object containing the query string.
+    :return: A two-tuple containing a list of missing citation strings and the
+    suggested query string with missing citations removed.
+    """
+    missing_citations_str = [
+        citation.corrected_citation() for citation in missing_citations
+    ]
+    query_string = cd["q"]
+    for citation in missing_citations_str:
+        query_string = query_string.replace(citation, "")
+    suggested_query = (
+        " ".join(query_string.split()) if missing_citations_str else ""
+    )
+    return missing_citations_str, suggested_query
+
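# Hypothetical example (not part of this commit): if the user queried
# "Lorem v. Ipsum, 999 U.S. 999" and that citation is not in the index, the
# citation is stripped and the remaining whitespace collapsed:
#
#   missing, suggested = remove_missing_citations(missing_citations, cd)
#   # missing   -> ["999 U.S. 999"]
#   # suggested -> "Lorem v. Ipsum,"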
+
+def do_es_search(
+    get_params: QueryDict,
+    rows: int = settings.SEARCH_PAGE_SIZE,
+    facet: bool = True,
+    cache_key: str | None = None,
+):
+    """Run Elasticsearch searching and filtering and prepare the data for display.
+    :param get_params: The request.GET params sent by the user.
+    :param rows: The number of Elasticsearch results to request.
+    :param facet: Whether to complete faceting in the query.
+    :param cache_key: A cache key with which to save the results. Note that it
+    does not do anything clever with the actual query, so if you use this, your
+    cache key should *already* have factored in the query. If None, no caching
+    is set or used. Results are saved for six hours.
+    :return: A big dict of variables for use on the search results page, the
+    homepage, or other locations.
+    """
+    paged_results = None
+    courts = Court.objects.filter(in_use=True)
+    query_time: int | None = 0
+    total_query_results: int | None = 0
+    top_hits_limit: int | None = 5
+    document_type = None
+    error_message = ""
+    suggested_query = ""
+    total_child_results: int | None = 0
+    related_cluster = None
+    cited_cluster = None
+    query_citation = None
+    facet_fields = []
+    missing_citations_str: list[str] = []
+    error = True
+
+    search_form = SearchForm(get_params, courts=courts)
+    match get_params.get("type", SEARCH_TYPES.OPINION):
+        case SEARCH_TYPES.PARENTHETICAL:
+            document_type = ParentheticalGroupDocument
+        case SEARCH_TYPES.ORAL_ARGUMENT:
+            document_type = AudioDocument
+        case SEARCH_TYPES.PEOPLE:
+            document_type = PersonDocument
+        case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS:
+            document_type = DocketDocument
+            # Set a different number of results per page for RECAP search.
+            rows = settings.RECAP_SEARCH_PAGE_SIZE
+        case SEARCH_TYPES.OPINION:
+            document_type = OpinionClusterDocument
+
+    if search_form.is_valid() and document_type:
+        # Copy cleaned_data to preserve the original data when displaying the form.
+        cd = search_form.cleaned_data.copy()
+        try:
+            # Create the necessary filters to execute the ES query.
+            search_query = document_type.search()
+
+            if cd["type"] in [
+                SEARCH_TYPES.OPINION,
+                SEARCH_TYPES.RECAP,
+                SEARCH_TYPES.DOCKETS,
+            ]:
+                query_citation, missing_citations = es_get_query_citation(cd)
+                if cd["type"] in [
+                    SEARCH_TYPES.OPINION,
+                ]:
+                    missing_citations_str, suggested_query = (
+                        remove_missing_citations(missing_citations, cd)
+                    )
+                    cd["q"] = suggested_query if suggested_query else cd["q"]
+            (
+                s,
+                child_docs_count_query,
+                top_hits_limit,
+            ) = build_es_main_query(search_query, cd)
+            (
+                paged_results,
+                query_time,
+                error,
+                total_query_results,
+                total_child_results,
+            ) = fetch_and_paginate_results(
+                get_params,
+                s,
+                child_docs_count_query,
+                rows_per_page=rows,
+                cache_key=cache_key,
+            )
+            cited_cluster = async_to_sync(add_depth_counts)(
+                # Also returns the cited cluster if found.
+                search_data=cd,
+                search_results=paged_results,
+            )
+            related_prefix = RELATED_PATTERN.search(cd["q"])
+            if related_prefix:
+                related_pks = related_prefix.group("pks").split(",")
+                related_cluster = OpinionCluster.objects.filter(
+                    sub_opinions__pk__in=related_pks
+                ).distinct("pk")
+        except UnbalancedParenthesesQuery as e:
+            error = True
+            error_message = "unbalanced_parentheses"
+            if e.error_type == UnbalancedParenthesesQuery.QUERY_STRING:
+                suggested_query = sanitize_unbalanced_parenthesis(
+                    cd.get("q", "")
+                )
+        except UnbalancedQuotesQuery as e:
+            error = True
+            error_message = "unbalanced_quotes"
+            if e.error_type == UnbalancedQuotesQuery.QUERY_STRING:
+                suggested_query = sanitize_unbalanced_quotes(cd.get("q", ""))
+        except BadProximityQuery as e:
+            error = True
+            error_message = "bad_proximity_token"
+            suggested_query = "proximity_filter"
+            if e.error_type == BadProximityQuery.QUERY_STRING:
+                suggested_query = "proximity_query"
+        except DisallowedWildcardPattern:
+            error = True
+            error_message = "disallowed_wildcard_pattern"
+        finally:
+            # Make sure to always call the _clean_form method.
+            search_form = _clean_form(
+                get_params, search_form.cleaned_data, courts
+            )
+            if cd["type"] in [SEARCH_TYPES.OPINION] and facet:
+                # If the search query is valid, pass the cleaned data to filter
+                # and retrieve the correct number of opinions per status.
+                # Otherwise (if the query has errors), just provide a dictionary
+                # containing the search type to get the total number of opinions
+                # per status.
+                facet_fields = get_facet_dict_for_search_query(
+                    search_query,
+                    cd if not error else {"type": cd["type"]},
+                    search_form,
+                )
+
+    courts, court_count_human, court_count = merge_form_with_courts(
+        courts, search_form
+    )
+    search_summary_str = search_form.as_text(court_count_human)
+    search_summary_dict = search_form.as_display_dict(court_count_human)
+    results_details = [
+        query_time,
+        total_query_results,
+        top_hits_limit,
+        total_child_results,
+    ]
+
+    return {
+        "results": paged_results,
+        "results_details": results_details,
+        "search_form": search_form,
+        "search_summary_str": search_summary_str,
+        "search_summary_dict": search_summary_dict,
+        "error": error,
+        "courts": courts,
+        "court_count_human": court_count_human,
+        "court_count": court_count,
+        "query_citation": query_citation,
+        "cited_cluster": cited_cluster,
+        "related_cluster": related_cluster,
+        "facet_fields": facet_fields,
+        "error_message": error_message,
+        "suggested_query": suggested_query,
+        "estimated_count_threshold": simplify_estimated_count(
+            compute_lowest_possible_estimate(
+                settings.ELASTICSEARCH_CARDINALITY_PRECISION
+            )
+        ),
+        "missing_citations": missing_citations_str,
+    }
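# Illustrative usage sketch (not part of this commit): a search view only
# needs to hand do_es_search() the request's GET parameters and render the
# returned context dict; the view name and template below are assumptions.
#
#   from django.shortcuts import render
#
#   def show_results(request):
#       context = do_es_search(request.GET)
#       return render(request, "search.html", context)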