refactor(scrapers.tasks.update_from_text): reuse make_citation in update_from_text #4913

Merged: 12 commits, Jan 16, 2025
Changes from all commits
49 changes: 3 additions & 46 deletions cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -18,8 +18,8 @@
from cl.scrapers.exceptions import BadContentError
from cl.scrapers.management.commands import cl_back_scrape_opinions
from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
from cl.scrapers.utils import get_binary_content
from cl.search.models import Citation, Court, Opinion
from cl.scrapers.utils import citation_is_duplicated, get_binary_content
from cl.search.models import Court, Opinion


class Command(cl_back_scrape_opinions.Command):
@@ -92,7 +92,7 @@ def scrape_court(
if not citation_candidate:
continue

if self.citation_is_duplicated(citation_candidate, cite):
if citation_is_duplicated(citation_candidate, cite):
continue

try:
@@ -106,46 +106,3 @@ def scrape_court(
cite,
cluster,
)

def citation_is_duplicated(
self, citation_candidate: Citation, cite: str
) -> bool:
"""Checks if the citation is duplicated for the cluster

Following corpus_importer.utils.add_citations_to_cluster we
identify 2 types of duplication:
- exact: a citation with the same fields already exists for the cluster
- duplication in the same reporter: the cluster already has a citation
in that reporter

:param citation_candidate: the citation object
:param cite: citation string

:return: True if citation is duplicated, False if not
"""
citation_params = {**citation_candidate.__dict__}
citation_params.pop("_state", "")
citation_params.pop("id", "")
cluster_id = citation_candidate.cluster.id

# Exact duplication
if Citation.objects.filter(**citation_params).exists():
logger.info(
"Citation '%s' already exists for cluster %s",
cite,
cluster_id,
)
return True

# Duplication in the same reporter
if Citation.objects.filter(
cluster_id=cluster_id, reporter=citation_candidate.reporter
).exists():
logger.info(
"Another citation in the same reporter '%s' exists for cluster %s",
citation_candidate.reporter,
cluster_id,
)
return True

return False
35 changes: 2 additions & 33 deletions cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -3,21 +3,18 @@
import time
import traceback
from datetime import date
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Any, Dict, List, Tuple, Union

from asgiref.sync import async_to_sync
from django.core.files.base import ContentFile
from django.core.management.base import CommandError
from django.db import transaction
from django.utils.encoding import force_bytes
from eyecite.find import get_citations
from eyecite.tokenizers import HyperscanTokenizer
from juriscraper.lib.importer import build_module_list
from juriscraper.lib.string_utils import CaseNameTweaker
from sentry_sdk import capture_exception

from cl.alerts.models import RealTimeQueue
from cl.citations.utils import map_reporter_db_cite_type
from cl.lib.command_utils import ScraperCommand, logger
from cl.lib.crypto import sha1
from cl.lib.string_utils import trunc
@@ -33,6 +30,7 @@
get_binary_content,
get_child_court,
get_extension,
make_citation,
save_response,
signal_handler,
update_or_create_docket,
@@ -47,40 +45,11 @@
OpinionCluster,
)

HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan")

# for use in catching the SIGINT (Ctrl+4)
die_now = False
cnt = CaseNameTweaker()


def make_citation(
cite_str: str, cluster: OpinionCluster, court_id: str
) -> Optional[Citation]:
"""Create and return a citation object for the input values."""
citation_objs = get_citations(cite_str, tokenizer=HYPERSCAN_TOKENIZER)
if not citation_objs:
logger.error(
"Could not parse citation from court '%s'",
court_id,
extra=dict(
cite=cite_str,
cluster=cluster,
fingerprint=[f"{court_id}-no-citation-found"],
),
)
return None
# Convert the found cite type to a valid cite type for our DB.
cite_type_str = citation_objs[0].all_editions[0].reporter.cite_type
return Citation(
cluster=cluster,
volume=citation_objs[0].groups["volume"],
reporter=citation_objs[0].corrected_reporter(),
page=citation_objs[0].groups["page"],
type=map_reporter_db_cite_type(cite_type_str),
)


@transaction.atomic
def make_objects(
item: Dict[str, Union[str, Any]],
26 changes: 14 additions & 12 deletions cl/scrapers/management/commands/update_from_text.py
@@ -4,7 +4,7 @@
from django.db import transaction

from cl.lib.command_utils import ScraperCommand, logger
from cl.scrapers.tasks import update_document_from_text
from cl.scrapers.tasks import extract_doc_content, update_document_from_text
from cl.search.models import (
PRECEDENTIAL_STATUS,
SOURCES,
@@ -33,10 +33,17 @@ def rerun_extract_from_text(
# May be an opinion entirely from a merged corpus
# or an error during text extraction
logger.info(
"Opinion %s has no `plain_text` or `html` to extract from",
"Opinion %s has no `plain_text` or `html`"
"to extract from. Executing extraction",
opinion.id,
)
stats["No text to extract from"] += 1
extract_doc_content(
pk=opinion.pk,
ocr_available=True,
citation_jitter=True,
juriscraper_module=juriscraper_module,
)
return

with transaction.atomic():
@@ -83,16 +90,11 @@ def rerun_extract_from_text(
logger.debug("Opinion updated with data %s", changes["Opinion"])
stats["Opinion"] += 1

if changes.get("Citation"):
if changes["Citation"].get("citation_created"):
logger.info(
"Citation created with data %s", changes["Citation"]
)
stats["Citation"] += 1
else:
logger.debug(
"Citation not created. Data %s", changes["Citation"]
)
if changes.get("citation_created"):
logger.info("Citation created with data %s", changes["Citation"])
stats["Citation"] += 1
elif changes.get("Citation"):
logger.debug("Citation not created. Data %s", changes["Citation"])


class Command(ScraperCommand):
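For context on the simplified branch above: `update_document_from_text` now reports whether a citation was saved through a top-level `citation_created` flag, while the `Citation` key still carries the raw scraped string. A minimal sketch of the dictionary shape the command checks; the field values are illustrative, not taken from this PR:

# Illustrative shape of the `changes` dict returned by
# update_document_from_text after this refactor. Only keys reported by the
# scraper's extract_from_text() are present; the values here are made up.
changes = {
    "Docket": {"docket_number": "23-0123"},
    "OpinionCluster": {"disposition": "Affirmed"},
    "Citation": "2023 VT 12",        # raw citation string from the scraper
    "citation_created": True,        # set by tasks.py after a successful save
}

if changes.get("citation_created"):
    print("Citation created with data", changes["Citation"])
elif changes.get("Citation"):
    print("Citation not created. Data", changes["Citation"])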
26 changes: 15 additions & 11 deletions cl/scrapers/tasks.py
@@ -6,7 +6,6 @@
import httpx
import requests
from asgiref.sync import async_to_sync
from django.apps import apps
from django.conf import settings
from django.core.files.base import ContentFile
from httpx import Response
@@ -30,7 +29,7 @@
from cl.lib.string_utils import trunc
from cl.lib.utils import is_iter
from cl.recap.mergers import save_iquery_to_docket
from cl.scrapers.utils import scraped_citation_object_is_valid
from cl.scrapers.utils import citation_is_duplicated, make_citation
from cl.search.models import Docket, Opinion, RECAPDocument

logger = logging.getLogger(__name__)
@@ -50,38 +49,43 @@ def update_document_from_text(
text. Formerly implemented in only Tax Court, but functional in all
scrapers via AbstractSite object.

Note that this updates the values but does not save them. Saving is left to
the calling function.
Note that this updates the values but does not save them for
Docket, OpinionCluster and Opinion. Saving is left to
the calling function. It does save Citations.

:param opinion: Opinion object
:param juriscraper_module: full module to get Site object
:return: the extracted data dictionary
"""
court = opinion.cluster.docket.court.pk
site = get_scraper_object_by_name(court, juriscraper_module)
court_id = opinion.cluster.docket.court.pk
site = get_scraper_object_by_name(court_id, juriscraper_module)
if site is None:
logger.debug("No site found %s", juriscraper_module)
return {}

citation_created = False
metadata_dict = site.extract_from_text(opinion.plain_text or opinion.html)
for model_name, data in metadata_dict.items():
ModelClass = apps.get_model(f"search.{model_name}")
if model_name == "Docket":
opinion.cluster.docket.__dict__.update(data)
elif model_name == "OpinionCluster":
opinion.cluster.__dict__.update(data)
elif model_name == "Citation":
data["cluster_id"] = opinion.cluster_id
if scraped_citation_object_is_valid(data):
_, citation_created = ModelClass.objects.get_or_create(**data)
metadata_dict["Citation"]["created"] = citation_created
citation = make_citation(data, opinion.cluster, court_id)
if not citation or citation_is_duplicated(citation, data):
continue
Comment on lines +75 to +76

Contributor: Would it make more sense to throw the error logging here, and not pass court_id into make_citation?

Contributor (author): It's awkward to pass the court_id only for logging, but if we didn't do it inside this function we would have to make an outside logger.error call every time make_citation returns None.

citation.save()
citation_created = True
elif model_name == "Opinion":
opinion.__dict__.update(data)
else:
raise NotImplementedError(
f"Object type of {model_name} not yet supported."
)

# if the candidate citation was saved successfully, it will have an id
metadata_dict["citation_created"] = citation_created

return metadata_dict


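Because the docstring above notes that Docket, OpinionCluster and Opinion are only updated in memory (the Citation alone is saved inside the task), a caller is expected to persist those objects itself. A rough sketch of that calling pattern; the helper name and the atomic wrapper are assumptions borrowed from update_from_text, not something this diff mandates:

from django.db import transaction

from cl.scrapers.tasks import update_document_from_text
from cl.search.models import Opinion


def apply_text_updates(opinion: Opinion, juriscraper_module: str) -> dict:
    """Run extract_from_text updates and persist the in-memory changes.

    update_document_from_text mutates opinion, opinion.cluster and
    opinion.cluster.docket without saving them; any Citation is already
    saved by the task itself.
    """
    changes = update_document_from_text(opinion, juriscraper_module)
    if not changes:
        return {}
    with transaction.atomic():  # assumption: saves grouped as in update_from_text
        opinion.cluster.docket.save()
        opinion.cluster.save()
        opinion.save()
    return changes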
4 changes: 2 additions & 2 deletions cl/scrapers/test_assets/test_opinion_scraper.py
@@ -59,7 +59,7 @@ def extract_from_text(self, scraped_text):
metadata = {}
docket_regex = r"Docket Number: (?P<docket>\d+-\d+)"
disposition_regex = r"Disposition: (?P<disposition>\w+)"
citation_regex = r"(?P<volume>20\d{2}) (?P<reporter>VT) (?P<page>\d+)"
citation_regex = r"20\d{2} VT \d+"
if docket_match := re.search(docket_regex, scraped_text):
metadata["Docket"] = {
"docket_number": docket_match.group("docket")
@@ -71,6 +71,6 @@
}

if citation_match := re.search(citation_regex, scraped_text):
metadata["Citation"] = {**citation_match.groupdict(), "type": 8}
metadata["Citation"] = citation_match.group(0)

return metadata
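With this change the test scraper returns the raw matched string and leaves parsing to make_citation, instead of building a volume/reporter/page dict itself. A quick illustration of what the new regex yields; the sample text is invented:

import re

# Same pattern as the test scraper above: a Vermont neutral citation.
citation_regex = r"20\d{2} VT \d+"

sample_text = "Disposition: Affirmed\n2023 VT 12\nDocket Number: 23-123"

metadata = {}
if citation_match := re.search(citation_regex, sample_text):
    # The whole match is passed along as a plain string; make_citation
    # later extracts volume, reporter and page from it with eyecite.
    metadata["Citation"] = citation_match.group(0)

print(metadata)  # {'Citation': '2023 VT 12'}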
21 changes: 0 additions & 21 deletions cl/scrapers/tests.py
@@ -40,7 +40,6 @@
get_binary_content,
get_existing_docket,
get_extension,
scraped_citation_object_is_valid,
update_or_create_docket,
)
from cl.search.factories import (
@@ -994,26 +993,6 @@ def test_inputs(self):
"Unpublished docket should not be modified",
)

def test_scraped_citation_object_is_valid(self):
"""Can we validate Citation dicts got from `Site.extract_from_text`"""
bad_type = {"reporter": "WI", "type": Citation.FEDERAL}
self.assertFalse(
scraped_citation_object_is_valid(bad_type),
"Citation should be marked as invalid. Type does not match reporter",
)

bad_reporter = {"reporter": "Some text"}
self.assertFalse(
scraped_citation_object_is_valid(bad_reporter),
"Citation should be marked as invalid. Reporter does not exist",
)

valid_citation = {"reporter": "WI", "type": Citation.NEUTRAL}
self.assertTrue(
scraped_citation_object_is_valid(valid_citation),
"Citation object should be marked as valid",
)


class CommandInputTest(TestCase):
def test_get_module_by_court_id(self):
75 changes: 74 additions & 1 deletion cl/scrapers/utils.py
@@ -11,6 +11,8 @@
from django.conf import settings
from django.core.files.base import ContentFile
from django.db.models import Q
from eyecite.find import get_citations
from eyecite.tokenizers import HyperscanTokenizer
from juriscraper import AbstractSite
from juriscraper.AbstractSite import logger
from juriscraper.lib.test_utils import MockRequest
@@ -29,7 +31,78 @@
NoDownloadUrlError,
UnexpectedContentTypeError,
)
from cl.search.models import Court, Docket
from cl.search.models import Citation, Court, Docket, OpinionCluster

HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan")


def make_citation(
cite_str: str, cluster: OpinionCluster, court_id: str
) -> Optional[Citation]:
"""Create and return a citation object for the input values."""
citation_objs = get_citations(cite_str, tokenizer=HYPERSCAN_TOKENIZER)
if not citation_objs:
logger.error(
"Could not parse citation from court '%s'",
court_id,
extra=dict(
cite=cite_str,
cluster=cluster,
fingerprint=[f"{court_id}-no-citation-found"],
),
)
return None
# Convert the found cite type to a valid cite type for our DB.
cite_type_str = citation_objs[0].all_editions[0].reporter.cite_type
return Citation(
cluster=cluster,
volume=citation_objs[0].groups["volume"],
reporter=citation_objs[0].corrected_reporter(),
page=citation_objs[0].groups["page"],
type=map_reporter_db_cite_type(cite_type_str),
)


def citation_is_duplicated(citation_candidate: Citation, cite: str) -> bool:
"""Checks if the citation is duplicated for the cluster

Following corpus_importer.utils.add_citations_to_cluster we
identify 2 types of duplication:
- exact: a citation with the same fields already exists for the cluster
- duplication in the same reporter: the cluster already has a citation
in that reporter

:param citation_candidate: the citation object
:param cite: citation string

:return: True if citation is duplicated, False if not
"""
citation_params = {**citation_candidate.__dict__}
citation_params.pop("_state", "")
citation_params.pop("id", "")
cluster_id = citation_candidate.cluster.id

# Exact duplication
if Citation.objects.filter(**citation_params).exists():
logger.info(
"Citation '%s' already exists for cluster %s",
cite,
cluster_id,
)
return True

# Duplication in the same reporter
if Citation.objects.filter(
cluster_id=cluster_id, reporter=citation_candidate.reporter
).exists():
logger.info(
"Another citation in the same reporter '%s' exists for cluster %s",
citation_candidate.reporter,
cluster_id,
)
return True

return False


def get_child_court(child_court_name: str, court_id: str) -> Optional[Court]:
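Taken together, the two helpers now exported from cl.scrapers.utils give every scraper the same parse-then-dedupe flow. A hedged sketch of how a caller composes them, mirroring the pattern in cl_back_scrape_citations and tasks.py; the wrapper name save_scraped_citation is made up for illustration:

from cl.scrapers.utils import citation_is_duplicated, make_citation
from cl.search.models import OpinionCluster


def save_scraped_citation(
    cite_str: str, cluster: OpinionCluster, court_id: str
) -> bool:
    """Parse a scraped citation string and save it unless it already exists.

    make_citation returns None (and logs an error) when eyecite cannot parse
    the string; citation_is_duplicated skips exact and same-reporter
    duplicates for the cluster.
    """
    citation = make_citation(cite_str, cluster, court_id)
    if not citation or citation_is_duplicated(citation, cite_str):
        return False
    citation.save()
    return True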