refactor(scrapers.tasks.update_from_text): reuse make_citation in update_from_text #4913

Merged: 12 commits, Jan 16, 2025
Changes from all commits
49 changes: 3 additions & 46 deletions cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -18,8 +18,8 @@
from cl.scrapers.exceptions import BadContentError
from cl.scrapers.management.commands import cl_back_scrape_opinions
from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
from cl.scrapers.utils import get_binary_content
from cl.search.models import Citation, Court, Opinion
from cl.scrapers.utils import citation_is_duplicated, get_binary_content
from cl.search.models import Court, Opinion


class Command(cl_back_scrape_opinions.Command):
@@ -92,7 +92,7 @@ def scrape_court(
if not citation_candidate:
continue

if self.citation_is_duplicated(citation_candidate, cite):
if citation_is_duplicated(citation_candidate, cite):
continue

try:
@@ -106,46 +106,3 @@ def scrape_court(
cite,
cluster,
)

def citation_is_duplicated(
self, citation_candidate: Citation, cite: str
) -> bool:
"""Checks if the citation is duplicated for the cluster

Following corpus_importer.utils.add_citations_to_cluster we
identify 2 types of duplication:
- exact: a citation with the same fields already exists for the cluster
- duplication in the same reporter: the cluster already has a citation
in that reporter

:param citation_candidate: the citation object
:param cite: citation string

:return: True if citation is duplicated, False if not
"""
citation_params = {**citation_candidate.__dict__}
citation_params.pop("_state", "")
citation_params.pop("id", "")
cluster_id = citation_candidate.cluster.id

# Exact duplication
if Citation.objects.filter(**citation_params).exists():
logger.info(
"Citation '%s' already exists for cluster %s",
cite,
cluster_id,
)
return True

# Duplication in the same reporter
if Citation.objects.filter(
cluster_id=cluster_id, reporter=citation_candidate.reporter
).exists():
logger.info(
"Another citation in the same reporter '%s' exists for cluster %s",
citation_candidate.reporter,
cluster_id,
)
return True

return False
35 changes: 2 additions & 33 deletions cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -3,21 +3,18 @@
import time
import traceback
from datetime import date
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Tuple, Union

from asgiref.sync import async_to_sync
from django.core.files.base import ContentFile
from django.core.management.base import CommandError
from django.db import transaction
from django.utils.encoding import force_bytes
from eyecite.find import get_citations
from eyecite.tokenizers import HyperscanTokenizer
from juriscraper.lib.importer import build_module_list
from juriscraper.lib.string_utils import CaseNameTweaker
from sentry_sdk import capture_exception

from cl.alerts.models import RealTimeQueue
from cl.citations.utils import map_reporter_db_cite_type
from cl.lib.command_utils import ScraperCommand, logger
from cl.lib.crypto import sha1
from cl.lib.string_utils import trunc
@@ -33,6 +30,7 @@
get_binary_content,
get_child_court,
get_extension,
make_citation,
save_response,
signal_handler,
update_or_create_docket,
@@ -47,40 +45,11 @@
OpinionCluster,
)

HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan")

# for use in catching the SIGINT (Ctrl+4)
die_now = False
cnt = CaseNameTweaker()


def make_citation(
cite_str: str, cluster: OpinionCluster, court_id: str
) -> Optional[Citation]:
"""Create and return a citation object for the input values."""
citation_objs = get_citations(cite_str, tokenizer=HYPERSCAN_TOKENIZER)
if not citation_objs:
logger.error(
"Could not parse citation from court '%s'",
court_id,
extra=dict(
cite=cite_str,
cluster=cluster,
fingerprint=[f"{court_id}-no-citation-found"],
),
)
return None
# Convert the found cite type to a valid cite type for our DB.
cite_type_str = citation_objs[0].all_editions[0].reporter.cite_type
return Citation(
cluster=cluster,
volume=citation_objs[0].groups["volume"],
reporter=citation_objs[0].corrected_reporter(),
page=citation_objs[0].groups["page"],
type=map_reporter_db_cite_type(cite_type_str),
)


@transaction.atomic
def make_objects(
item: Dict[str, Union[str, Any]],
26 changes: 14 additions & 12 deletions cl/scrapers/management/commands/update_from_text.py
@@ -4,7 +4,7 @@
from django.db import transaction

from cl.lib.command_utils import ScraperCommand, logger
from cl.scrapers.tasks import update_document_from_text
from cl.scrapers.tasks import extract_doc_content, update_document_from_text
from cl.search.models import (
PRECEDENTIAL_STATUS,
SOURCES,
@@ -33,10 +33,17 @@ def rerun_extract_from_text(
# May be an opinion entirely from a merged corpus
# or an error during text extraction
logger.info(
"Opinion %s has no `plain_text` or `html` to extract from",
"Opinion %s has no `plain_text` or `html`"
"to extract from. Executing extraction",
opinion.id,
)
stats["No text to extract from"] += 1
extract_doc_content(
pk=opinion.pk,
ocr_available=True,
citation_jitter=True,
juriscraper_module=juriscraper_module,
)
return

with transaction.atomic():
@@ -83,16 +90,11 @@ def rerun_extract_from_text(
logger.debug("Opinion updated with data %s", changes["Opinion"])
stats["Opinion"] += 1

if changes.get("Citation"):
if changes["Citation"].get("citation_created"):
logger.info(
"Citation created with data %s", changes["Citation"]
)
stats["Citation"] += 1
else:
logger.debug(
"Citation not created. Data %s", changes["Citation"]
)
if changes.get("citation_created"):
logger.info("Citation created with data %s", changes["Citation"])
stats["Citation"] += 1
elif changes.get("Citation"):
logger.debug("Citation not created. Data %s", changes["Citation"])


class Command(ScraperCommand):
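For context on the simplified branch above: `update_document_from_text` now reports whether a citation was saved through a top-level `citation_created` flag, while the `Citation` key still carries the raw scraped string. A minimal sketch of the dictionary shape the command checks; the field values are illustrative, not taken from this PR:

# Illustrative shape of the `changes` dict returned by
# update_document_from_text after this refactor. Only keys reported by the
# scraper's extract_from_text() are present; the values here are made up.
changes = {
    "Docket": {"docket_number": "23-0123"},
    "OpinionCluster": {"disposition": "Affirmed"},
    "Citation": "2023 VT 12",        # raw citation string from the scraper
    "citation_created": True,        # set by tasks.py after a successful save
}

if changes.get("citation_created"):
    print("Citation created with data", changes["Citation"])
elif changes.get("Citation"):
    print("Citation not created. Data", changes["Citation"])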
26 changes: 15 additions & 11 deletions cl/scrapers/tasks.py
@@ -6,7 +6,6 @@
import httpx
import requests
from asgiref.sync import async_to_sync
from django.apps import apps
from django.conf import settings
from django.core.files.base import ContentFile
from httpx import Response
@@ -30,7 +29,7 @@
from cl.lib.string_utils import trunc
from cl.lib.utils import is_iter
from cl.recap.mergers import save_iquery_to_docket
from cl.scrapers.utils import scraped_citation_object_is_valid
from cl.scrapers.utils import citation_is_duplicated, make_citation
from cl.search.models import Docket, Opinion, RECAPDocument

logger = logging.getLogger(__name__)
@@ -50,38 +49,43 @@ def update_document_from_text(
text. Formerly implemented in only Tax Court, but functional in all
scrapers via AbstractSite object.

Note that this updates the values but does not save them. Saving is left to
the calling function.
Note that this updates the values but does not save them for
Docket, OpinionCluster and Opinion. Saving is left to
the calling function. It does save Citations.

:param opinion: Opinion object
:param juriscraper_module: full module to get Site object
:return: the extracted data dictionary
"""
court = opinion.cluster.docket.court.pk
site = get_scraper_object_by_name(court, juriscraper_module)
court_id = opinion.cluster.docket.court.pk
site = get_scraper_object_by_name(court_id, juriscraper_module)
if site is None:
logger.debug("No site found %s", juriscraper_module)
return {}

citation_created = False
metadata_dict = site.extract_from_text(opinion.plain_text or opinion.html)
for model_name, data in metadata_dict.items():
ModelClass = apps.get_model(f"search.{model_name}")
if model_name == "Docket":
opinion.cluster.docket.__dict__.update(data)
elif model_name == "OpinionCluster":
opinion.cluster.__dict__.update(data)
elif model_name == "Citation":
data["cluster_id"] = opinion.cluster_id
if scraped_citation_object_is_valid(data):
_, citation_created = ModelClass.objects.get_or_create(**data)
metadata_dict["Citation"]["created"] = citation_created
citation = make_citation(data, opinion.cluster, court_id)
if not citation or citation_is_duplicated(citation, data):
continue
Comment on lines +75 to +76

Contributor: Would it make more sense to throw the error logging here, and not pass court_id into make_citation?

Contributor (author): It's awkward to pass the court_id only for logging, but if we didn't do it inside this function we would have to make an outside logger.error call every time make_citation returns None.

citation.save()
citation_created = True
elif model_name == "Opinion":
opinion.__dict__.update(data)
else:
raise NotImplementedError(
f"Object type of {model_name} not yet supported."
)

# if the candidate citation was saved successfully, it will have an id
metadata_dict["citation_created"] = citation_created

return metadata_dict


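Because the docstring above notes that Docket, OpinionCluster and Opinion are only updated in memory (the Citation alone is saved inside the task), a caller is expected to persist those objects itself. A rough sketch of that calling pattern; the helper name and the atomic wrapper are assumptions borrowed from update_from_text, not something this diff mandates:

from django.db import transaction

from cl.scrapers.tasks import update_document_from_text
from cl.search.models import Opinion


def apply_text_updates(opinion: Opinion, juriscraper_module: str) -> dict:
    """Run extract_from_text updates and persist the in-memory changes.

    update_document_from_text mutates opinion, opinion.cluster and
    opinion.cluster.docket without saving them; any Citation is already
    saved by the task itself.
    """
    changes = update_document_from_text(opinion, juriscraper_module)
    if not changes:
        return {}
    with transaction.atomic():  # assumption: saves grouped as in update_from_text
        opinion.cluster.docket.save()
        opinion.cluster.save()
        opinion.save()
    return changes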
4 changes: 2 additions & 2 deletions cl/scrapers/test_assets/test_opinion_scraper.py
@@ -59,7 +59,7 @@ def extract_from_text(self, scraped_text):
metadata = {}
docket_regex = r"Docket Number: (?P<docket>\d+-\d+)"
disposition_regex = r"Disposition: (?P<disposition>\w+)"
citation_regex = r"(?P<volume>20\d{2}) (?P<reporter>VT) (?P<page>\d+)"
citation_regex = r"20\d{2} VT \d+"
if docket_match := re.search(docket_regex, scraped_text):
metadata["Docket"] = {
"docket_number": docket_match.group("docket")
@@ -71,6 +71,6 @@
}

if citation_match := re.search(citation_regex, scraped_text):
metadata["Citation"] = {**citation_match.groupdict(), "type": 8}
metadata["Citation"] = citation_match.group(0)

return metadata
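With this change the test scraper returns the raw matched string and leaves parsing to make_citation, instead of building a volume/reporter/page dict itself. A quick illustration of what the new regex yields; the sample text is invented:

import re

# Same pattern as the test scraper above: a Vermont neutral citation.
citation_regex = r"20\d{2} VT \d+"

sample_text = "Disposition: Affirmed\n2023 VT 12\nDocket Number: 23-123"

metadata = {}
if citation_match := re.search(citation_regex, sample_text):
    # The whole match is passed along as a plain string; make_citation
    # later extracts volume, reporter and page from it with eyecite.
    metadata["Citation"] = citation_match.group(0)

print(metadata)  # {'Citation': '2023 VT 12'}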
21 changes: 0 additions & 21 deletions cl/scrapers/tests.py
@@ -40,7 +40,6 @@
get_binary_content,
get_existing_docket,
get_extension,
scraped_citation_object_is_valid,
update_or_create_docket,
)
from cl.search.factories import (
@@ -994,26 +993,6 @@ def test_inputs(self):
"Unpublished docket should not be modified",
)

def test_scraped_citation_object_is_valid(self):
"""Can we validate Citation dicts got from `Site.extract_from_text`"""
bad_type = {"reporter": "WI", "type": Citation.FEDERAL}
self.assertFalse(
scraped_citation_object_is_valid(bad_type),
"Citation should be marked as invalid. Type does not match reporter",
)

bad_reporter = {"reporter": "Some text"}
self.assertFalse(
scraped_citation_object_is_valid(bad_reporter),
"Citation should be marked as invalid. Reporter does not exist",
)

valid_citation = {"reporter": "WI", "type": Citation.NEUTRAL}
self.assertTrue(
scraped_citation_object_is_valid(valid_citation),
"Citation object should be marked as valid",
)


class CommandInputTest(TestCase):
def test_get_module_by_court_id(self):
75 changes: 74 additions & 1 deletion cl/scrapers/utils.py
@@ -11,6 +11,8 @@
from django.conf import settings
from django.core.files.base import ContentFile
from django.db.models import Q
from eyecite.find import get_citations
from eyecite.tokenizers import HyperscanTokenizer
from juriscraper import AbstractSite
from juriscraper.AbstractSite import logger
from juriscraper.lib.test_utils import MockRequest
@@ -29,7 +31,78 @@
NoDownloadUrlError,
UnexpectedContentTypeError,
)
from cl.search.models import Court, Docket
from cl.search.models import Citation, Court, Docket, OpinionCluster

HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan")


def make_citation(
cite_str: str, cluster: OpinionCluster, court_id: str
) -> Optional[Citation]:
"""Create and return a citation object for the input values."""
citation_objs = get_citations(cite_str, tokenizer=HYPERSCAN_TOKENIZER)
if not citation_objs:
logger.error(
"Could not parse citation from court '%s'",
court_id,
extra=dict(
cite=cite_str,
cluster=cluster,
fingerprint=[f"{court_id}-no-citation-found"],
),
)
return None
# Convert the found cite type to a valid cite type for our DB.
cite_type_str = citation_objs[0].all_editions[0].reporter.cite_type
return Citation(
cluster=cluster,
volume=citation_objs[0].groups["volume"],
reporter=citation_objs[0].corrected_reporter(),
page=citation_objs[0].groups["page"],
type=map_reporter_db_cite_type(cite_type_str),
)


def citation_is_duplicated(citation_candidate: Citation, cite: str) -> bool:
"""Checks if the citation is duplicated for the cluster

Following corpus_importer.utils.add_citations_to_cluster we
identify 2 types of duplication:
- exact: a citation with the same fields already exists for the cluster
- duplication in the same reporter: the cluster already has a citation
in that reporter

:param citation_candidate: the citation object
:param cite: citation string

:return: True if citation is duplicated, False if not
"""
citation_params = {**citation_candidate.__dict__}
citation_params.pop("_state", "")
citation_params.pop("id", "")
cluster_id = citation_candidate.cluster.id

# Exact duplication
if Citation.objects.filter(**citation_params).exists():
logger.info(
"Citation '%s' already exists for cluster %s",
cite,
cluster_id,
)
return True

# Duplication in the same reporter
if Citation.objects.filter(
cluster_id=cluster_id, reporter=citation_candidate.reporter
).exists():
logger.info(
"Another citation in the same reporter '%s' exists for cluster %s",
citation_candidate.reporter,
cluster_id,
)
return True

return False


def get_child_court(child_court_name: str, court_id: str) -> Optional[Court]:
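Taken together, the two helpers now exported from cl.scrapers.utils give every scraper the same parse-then-dedupe flow. A hedged sketch of how a caller composes them, mirroring the pattern in cl_back_scrape_citations and tasks.py; the wrapper name save_scraped_citation is made up for illustration:

from cl.scrapers.utils import citation_is_duplicated, make_citation
from cl.search.models import OpinionCluster


def save_scraped_citation(
    cite_str: str, cluster: OpinionCluster, court_id: str
) -> bool:
    """Parse a scraped citation string and save it unless it already exists.

    make_citation returns None (and logs an error) when eyecite cannot parse
    the string; citation_is_duplicated skips exact and same-reporter
    duplicates for the cluster.
    """
    citation = make_citation(cite_str, cluster, court_id)
    if not citation or citation_is_duplicated(citation, cite_str):
        return False
    citation.save()
    return True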