From c2810f9bbd0834f6a62ab1504a6cbae649f2afa3 Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Wed, 6 Mar 2024 17:24:30 -0500
Subject: [PATCH] feat(cl_scrape_opinions): support nested objects scraper

The main difference is that the nested object format supports
OpinionClusters, which may have multiple Opinions. The current data flow
is based on a single opinion per item, so a few functions were abstracted
to share code with the new nested object approach.
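A rough sketch of the nested item shape the new code path consumes (the
values below are made up for illustration; only the keys read by
scrape_court and make_validated_objects are shown, and a real scraper
must supply the full set of Docket and OpinionCluster fields):

    {
        "Docket": {
            "case_name": "Foo v. Bar",
            "docket_number": "23-1234",
            "OriginatingCourtInformation": {},  # optional
            "OpinionCluster": {
                "case_name": "Foo v. Bar",
                "date_filed": date(2024, 3, 6),  # a datetime.date
                "precedential_status": "Published",
                "citation_strings": [],  # strings parsed via make_citation
                "Citations": [],  # ready-made kwargs for Citation objects
                "Opinions": [
                    {"download_url": "https://example.com/foo.pdf"},
                ],
            },
        },
    }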
---
 .../management/commands/cl_scrape_opinions.py | 350 +++++++++++++-----
 cl/scrapers/utils.py                          |  69 +++-
 2 files changed, 328 insertions(+), 91 deletions(-)

diff --git a/cl/scrapers/management/commands/cl_scrape_opinions.py b/cl/scrapers/management/commands/cl_scrape_opinions.py
index 75ea05441a..9919ced525 100644
--- a/cl/scrapers/management/commands/cl_scrape_opinions.py
+++ b/cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -38,6 +38,7 @@
     Docket,
     Opinion,
     OpinionCluster,
+    OriginatingCourtInformation,
 )
 
 # for use in catching the SIGINT (Ctrl+4)
@@ -45,6 +46,75 @@
 cnt = CaseNameTweaker()
 
 
+def check_duplicated_content(
+    download_url: str,
+    site,
+    court: Court,
+    precedential_status: str,
+    current_date: date,
+    next_date: date | None,
+    dup_checker: DupChecker,
+) -> Tuple[bytes, str, bool]:
+    """Downloads opinion's content and checks duplication via hash
+
+    :param download_url: opinion's content URL
+    :param site: a juriscraper scraper object
+    :param court: a court object, used to decide duplication lookup query
+    :param precedential_status: used to decide duplication lookup query
+    :param current_date: used by dup checker
+    :param next_date: used by dup checker
+    :param dup_checker: DupChecker used to look up possible duplicates
+    :return: opinion's raw content, sha1 hash
+        and `proceed` flag to continue parsing the record or skip it
+    """
+    court_str = court.id
+    # Minnesota currently rejects Courtlistener and Juriscraper as a User Agent
+    if court_str in ["minn", "minnctapp"]:
+        headers = site.headers
+    else:
+        headers = {"User-Agent": "CourtListener"}
+
+    msg, r = get_binary_content(
+        download_url,
+        site,
+        headers,
+        method=site.method,
+    )
+    if msg:
+        logger.warning(msg)
+        ErrorLog(log_level="WARNING", court=court, message=msg).save()
+        return b"", "", False
+
+    content = site.cleanup_content(r.content)
+
+    # request.content is sometimes a str, sometimes unicode, so
+    # force it all to be bytes, pleasing hashlib.
+    sha1_hash = sha1(force_bytes(content))
+    if (
+        court_str == "nev" and precedential_status == "Unpublished"
+    ) or court_str in ["neb"]:
+        # Nevada's non-precedential cases have different SHA1 sums
+        # every time.
+
+        # Nebraska updates the pdf causing the SHA1 to not match
+        # the opinions in CL causing duplicates. See CL issue #1452
+
+        lookup_params = {
+            "lookup_value": download_url,
+            "lookup_by": "download_url",
+        }
+    else:
+        lookup_params = {
+            "lookup_value": sha1_hash,
+            "lookup_by": "sha1",
+        }
+
+    proceed = dup_checker.press_on(
+        Opinion, current_date, next_date, **lookup_params
+    )
+    return content, sha1_hash, proceed
+
+
 def make_citation(
     cite_str: str, cluster: OpinionCluster, court_id: str
 ) -> Optional[Citation]:
@@ -72,6 +142,22 @@ def make_citation(
     )
 
 
+def save_file_content(
+    opinion: Opinion, cluster: OpinionCluster, content: bytes
+) -> None:
+    """Saves Opinion's file content and stores reference on Opinion object
+
+    :param opinion: the opinion
+    :param cluster: opinion's parent cluster
+    :param content: file content
+    """
+    cf = ContentFile(content)
+    extension = get_extension(content)
+    file_name = trunc(cluster.case_name.lower(), 75) + extension
+    opinion.file_with_date = cluster.date_filed
+    opinion.local_path.save(file_name, cf, save=False)
+
+
 @transaction.atomic
 def make_objects(
     item: Dict[str, Union[str, Any]],
@@ -133,16 +219,88 @@ def make_objects(
         sha1=sha1_hash,
         download_url=url,
     )
-
-    cf = ContentFile(content)
-    extension = get_extension(content)
-    file_name = trunc(item["case_names"].lower(), 75) + extension
-    opinion.file_with_date = cluster.date_filed
-    opinion.local_path.save(file_name, cf, save=False)
+    save_file_content(opinion, cluster, content)
 
     return docket, opinion, cluster, citations
 
 
+@transaction.atomic
+def make_validated_objects(
+    docket_json: Dict[str, Union[str, Any]],
+    contents: List[Tuple[bytes, str]],
+    court: Court | str,
+) -> Dict[
+    str,
+    Union[
+        Docket,
+        OpinionCluster,
+        List[Opinion],
+        List[Citation],
+        OriginatingCourtInformation,
+    ],
+]:
+    """Takes the metadata from the scraper and associates it with objects.
+    :param docket_json: nested object scraped by scraper
+    :param contents: opinion's file contents and hashes
+    :param court: court string or object
+
+    :return: dictionary of instantiated objects
+    """
+    items = {}
+
+    # Unpack object
+    d = docket_json["Docket"]
+    oc = d.pop("OpinionCluster")
+    op_json = oc.pop("Opinions")
+    citation_strings = oc.pop("citation_strings", [])
+    citations_json = oc.pop("Citations", [])
+    oci = d.pop("OriginatingCourtInformation", {})
+
+    if oci:
+        items["originating_court_information"] = OriginatingCourtInformation(
+            **oci
+        )
+
+    # Docket
+    d["court_id"] = court.pk if isinstance(court, Court) else court
+    docket = update_or_create_docket(**d)
+
+    # OpinionCluster
+    cluster = OpinionCluster(**oc)
+
+    # Citations
+    citations = []
+    if citation_strings:
+        for cite in citation_strings:
+            if not cite:
+                continue
+            cite_obj = make_citation(cite, cluster, court.id)
+            if cite_obj:
+                citations.append(cite_obj)
+
+    for cite_json in citations_json:
+        citations.append(Citation(**cite_json))
+
+    # Opinions
+    opinions = []
+    for opinion_json, (content, sha1_hash) in zip(op_json, contents):
+        url = opinion_json["download_url"] if court.id != "tax" else ""
+        opinion_json.update({"download_url": url, "sha1": sha1_hash})
+        opinion = Opinion(**opinion_json)
+        save_file_content(opinion, cluster, content)
+        opinions.append(opinion)
+
+    items.update(
+        {
+            "docket": docket,
+            "opinion": opinions,
+            "cluster": cluster,
+            "citations": citations,
+        }
+    )
+    return items
+
+
 @transaction.atomic
 def save_everything(
     items: Dict[str, Any],
@@ -151,7 +309,16 @@
 ) -> None:
     """Saves all the sub items and associates them as appropriate."""
     docket, cluster = items["docket"], items["cluster"]
-    opinion, citations = items["opinion"], items["citations"]
+    opinions, citations = items["opinion"], items["citations"]
+
+    oci = items.get("originating_court_information")
+    if oci:
+        docket.originating_court_information = oci
+        oci.save()
+
+    if not isinstance(opinions, list):
+        opinions = [opinions]
+
     docket.save()
     cluster.docket = docket
     cluster.save(index=False)  # Index only when the opinion is associated.
@@ -165,18 +332,21 @@ def save_everything(
             cluster.judges, docket.court.pk, cluster.date_filed
         )
         if len(candidate_judges) == 1:
-            opinion.author = candidate_judges[0]
+            for opinion in opinions:
+                if not opinion.author:
+                    opinion.author = candidate_judges[0]
 
         if len(candidate_judges) > 1:
             for candidate in candidate_judges:
                 cluster.panel.add(candidate)
 
-    opinion.cluster = cluster
-    opinion.save(index=index)
-    if not backscrape:
-        RealTimeQueue.objects.create(
-            item_type=SEARCH_TYPES.OPINION, item_pk=opinion.pk
-        )
+    for opinion in opinions:
+        opinion.cluster = cluster
+        opinion.save(index=index)
+        if not backscrape:
+            RealTimeQueue.objects.create(
+                item_type=SEARCH_TYPES.OPINION, item_pk=opinion.pk
+            )
 
 
 class Command(VerboseCommand):
@@ -244,97 +414,97 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
             logger.info(f"Using cookies: {site.cookies}")
         logger.debug(f"#{len(site)} opinions found.")
         added = 0
-        for i, item in enumerate(site):
-            # Minnesota currently rejects Courtlistener and Juriscraper as a User Agent
-            if court_str in ["minn", "minnctapp"]:
-                headers = site.headers
-            else:
-                headers = {"User-Agent": "CourtListener"}
-
-            msg, r = get_binary_content(
-                item["download_urls"],
-                site,
-                headers,
-                method=site.method,
-            )
-            if msg:
-                logger.warning(msg)
-                ErrorLog(log_level="WARNING", court=court, message=msg).save()
-                continue
-            content = site.cleanup_content(r.content)
+        is_cluster_site = getattr(site, "is_cluster_site", False)
 
-            current_date = item["case_dates"]
-            try:
-                next_date = site[i + 1]["case_dates"]
-            except IndexError:
-                next_date = None
-
-            # request.content is sometimes a str, sometimes unicode, so
-            # force it all to be bytes, pleasing hashlib.
-            sha1_hash = sha1(force_bytes(content))
-            if (
-                court_str == "nev"
-                and item["precedential_statuses"] == "Unpublished"
-            ) or court_str in ["neb"]:
-                # Nevada's non-precedential cases have different SHA1 sums
-                # every time.
-
-                # Nebraska updates the pdf causing the SHA1 to not match
-                # the opinions in CL causing duplicates. See CL issue #1452
-
-                lookup_params = {
-                    "lookup_value": item["download_urls"],
-                    "lookup_by": "download_url",
-                }
+        for i, item in enumerate(site):
+            if is_cluster_site:
+                oc = item["Docket"]["OpinionCluster"]
+                try:
+                    next_oc = site[i + 1]["Docket"]["OpinionCluster"]
+                    next_date = next_oc["date_filed"]
+                except IndexError:
+                    next_date = None
+                download_urls = [op["download_url"] for op in oc["Opinions"]]
+                current_date = oc["date_filed"]
+                case_name = oc["case_name"].encode()
+                precedential_status = oc["precedential_status"]
             else:
-                lookup_params = {
-                    "lookup_value": sha1_hash,
-                    "lookup_by": "sha1",
-                }
+                download_urls = [item["download_urls"]]
+                current_date = item["case_dates"]
+                precedential_status = item["precedential_statuses"]
+                case_name = item["case_names"].encode()
+                try:
+                    next_date = site[i + 1]["case_dates"]
+                except IndexError:
+                    next_date = None
+
+            opinion_contents = []
+            for download_url in download_urls:
+                content, sha1_hash, proceed = check_duplicated_content(
+                    download_url,
+                    site,
+                    court,
+                    precedential_status,
+                    current_date,
+                    next_date,
+                    dup_checker,
+                )
+                opinion_contents.append((content, sha1_hash))
 
-            proceed = dup_checker.press_on(
-                Opinion, current_date, next_date, **lookup_params
-            )
-            if dup_checker.emulate_break:
-                logger.debug("Emulate break triggered.")
-                break
-            if not proceed:
-                logger.debug("Skipping opinion.")
-                continue
+                if dup_checker.emulate_break:
+                    logger.debug("Emulate break triggered.")
+                    break
+                if not proceed:
+                    logger.debug("Skipping opinion.")
+                    continue
+                # Not a duplicate, carry on
+                logger.info(
+                    "Adding new document found at: %s", download_url.encode()
+                )
+                dup_checker.reset()
 
-            # Not a duplicate, carry on
-            logger.info(
-                f"Adding new document found at: {item['download_urls'].encode()}"
-            )
-            dup_checker.reset()
+            if not opinion_contents:
+                # When all opinions in a cluster have already been downloaded
+                continue
 
             child_court = get_child_court(
                 item.get("child_courts", ""), court.id
             )
-            docket, opinion, cluster, citations = make_objects(
-                item, child_court or court, sha1_hash, content
-            )
-
-            save_everything(
-                items={
+            if is_cluster_site:
+                items = make_validated_objects(
+                    item, opinion_contents, child_court or court
+                )
+            else:
+                # OpinionSite and OpinionSiteLinear scrapers support
+                # a single opinion per scraped item
+                docket, opinion, cluster, citations = make_objects(
+                    item, child_court or court, sha1_hash, content
+                )
+                items = {
                     "docket": docket,
                     "opinion": opinion,
                     "cluster": cluster,
                     "citations": citations,
-                },
-                index=False,
-            )
-            extract_doc_content.delay(
-                opinion.pk, ocr_available=ocr_available, citation_jitter=True
-            )
+                }
 
-            logger.info(
-                f"Successfully added opinion {opinion.pk}: "
-                f"{item['case_names'].encode()}"
-            )
-            added += 1
+            save_everything(items=items, index=False)
+
+            opinions = items.get("opinion")
+            for opinion in (
+                opinions if isinstance(opinions, list) else [opinions]
+            ):
+                extract_doc_content.delay(
+                    opinion.pk,
+                    ocr_available=ocr_available,
+                    citation_jitter=True,
+                )
+                logger.info(
+                    "Successfully added opinion %s: %s", opinion.pk, case_name
+                )
+
+            added += len(opinion_contents)
 
         # Update the hash if everything finishes properly.
         logger.debug(
diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py
index acd876b758..88a9eccc25 100644
--- a/cl/scrapers/utils.py
+++ b/cl/scrapers/utils.py
@@ -290,21 +290,56 @@ def update_or_create_docket(
     date_blocked: date | None = None,
     date_argued: date | None = None,
     ia_needs_upload: bool | None = None,
+    appeal_from_str: str = "",
+    appeal_from_id: str | None = None,
+    assigned_to_str: str = "",
+    referred_to_str: str = "",
+    panel_str: str = "",
+    cause: str = "",
+    nature_of_suit: str = "",
+    jury_demand: str = "",
+    appellate_fee_status: str = "",
+    date_cert_granted: date | None = None,
+    date_cert_denied: date | None = None,
+    date_reargued: date | None = None,
+    date_reargument_denied: date | None = None,
+    date_filed: date | None = None,
+    date_terminated: date | None = None,
+    date_last_filing: date | None = None,
 ) -> Docket:
     """Look for an existing Docket and update it or create a new one if it's
     not found.
 
+    Required arguments:
     :param case_name: The docket case_name.
     :param case_name_short: The docket case_name_short
     :param court_id: The court id the docket belongs to.
     :param docket_number: The docket number.
     :param source: The docket source.
     :param blocked: If the docket should be blocked, default False.
+
+    Optional arguments:
     :param case_name_full: The docket case_name_full.
     :param date_blocked: The docket date_blocked if it's blocked.
     :param date_argued: The docket date_argued if it's an oral argument.
     :param ia_needs_upload: If the docket needs upload to IA, default None.
-    :return: The docket docket.
+    :param appeal_from_str: The court the case was appealed from, as a string.
+    :param assigned_to_str: The judge the case was assigned to, as a string.
+    :param referred_to_str: The judge the case was referred to, as a string.
+    :param panel_str: The judges on the panel, as a string.
+    :param cause: The docket cause.
+    :param nature_of_suit: The docket nature_of_suit.
+    :param jury_demand: The docket jury_demand.
+    :param appellate_fee_status: The docket appellate_fee_status.
+    :param date_cert_granted: The date certiorari was granted.
+    :param date_cert_denied: The date certiorari was denied.
+    :param date_reargued: The date the case was reargued.
+    :param date_reargument_denied: The date reargument was denied.
+    :param date_filed: The date the case was filed.
+    :param date_terminated: The date the case was terminated.
+    :param date_last_filing: The date of the last filing.
+
+    :return: The docket
     """
     docket = async_to_sync(find_docket_object)(court_id, None, docket_number)
     if docket.pk:
@@ -316,6 +351,22 @@
         docket.date_blocked = date_blocked
         docket.date_argued = date_argued
         docket.ia_needs_upload = ia_needs_upload
+        docket.appeal_from_str = appeal_from_str
+        docket.appeal_from_id = appeal_from_id
+        docket.assigned_to_str = assigned_to_str
+        docket.referred_to_str = referred_to_str
+        docket.panel_str = panel_str
+        docket.cause = cause
+        docket.nature_of_suit = nature_of_suit
+        docket.jury_demand = jury_demand
+        docket.appellate_fee_status = appellate_fee_status
+        docket.date_cert_granted = date_cert_granted
+        docket.date_cert_denied = date_cert_denied
+        docket.date_reargued = date_reargued
+        docket.date_reargument_denied = date_reargument_denied
+        docket.date_filed = date_filed
+        docket.date_terminated = date_terminated
+        docket.date_last_filing = date_last_filing
     else:
         docket = Docket(
             case_name=case_name,
@@ -328,5 +379,21 @@
             date_blocked=date_blocked,
             date_argued=date_argued,
             ia_needs_upload=ia_needs_upload,
+            appeal_from_str=appeal_from_str,
+            appeal_from_id=appeal_from_id,
+            assigned_to_str=assigned_to_str,
+            referred_to_str=referred_to_str,
+            panel_str=panel_str,
+            cause=cause,
+            nature_of_suit=nature_of_suit,
+            jury_demand=jury_demand,
+            appellate_fee_status=appellate_fee_status,
+            date_cert_granted=date_cert_granted,
+            date_cert_denied=date_cert_denied,
+            date_reargued=date_reargued,
+            date_reargument_denied=date_reargument_denied,
+            date_filed=date_filed,
+            date_terminated=date_terminated,
+            date_last_filing=date_last_filing,
         )
 
     return docket