From 437110fade6838e75d7fd9438cbadc1155653218 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Wed, 6 Nov 2024 16:55:51 -0500
Subject: [PATCH 1/6] feat(recap.mergers): Update PACER attachment processing

---
 cl/recap/mergers.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py
index 7eb9866e1d..c5b961d1ac 100644
--- a/cl/recap/mergers.py
+++ b/cl/recap/mergers.py
@@ -1658,9 +1658,21 @@ async def merge_attachment_page_data(
                 .afirst()
             )
         else:
-            main_rd = await RECAPDocument.objects.select_related(
-                "docket_entry", "docket_entry__docket"
-            ).aget(**params)
+            try:
+                main_rd = await RECAPDocument.objects.select_related(
+                    "docket_entry", "docket_entry__docket"
+                ).aget(**params)
+            except RECAPDocument.DoesNotExist as exc:
+                # In cases where we have "doppelgänger" dockets drop pacer
+                # case id and check if the docket exists once more.
+                if params.get("pacer_case_id"):
+                    retry_params = params.copy()
+                    retry_params.pop(
+                        "docket_entry__docket__pacer_case_id", None
+                    )
+                    main_rd = await RECAPDocument.objects.select_related(
+                        "docket_entry", "docket_entry__docket"
+                    ).aget(**retry_params)
     except RECAPDocument.MultipleObjectsReturned as exc:
         if pacer_case_id:
             duplicate_rd_queryset = RECAPDocument.objects.filter(**params)

From a4fbacb61d148585c9bc0c384f95a48d3b37648b Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 7 Nov 2024 07:30:15 -0500
Subject: [PATCH 2/6] fix(recap.mergers): Reraise exception

This should fix tests
---
 cl/recap/mergers.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py
index c5b961d1ac..519a59de91 100644
--- a/cl/recap/mergers.py
+++ b/cl/recap/mergers.py
@@ -1673,6 +1673,8 @@ async def merge_attachment_page_data(
                     main_rd = await RECAPDocument.objects.select_related(
                         "docket_entry", "docket_entry__docket"
                     ).aget(**retry_params)
+                else:
+                    raise exc
     except RECAPDocument.MultipleObjectsReturned as exc:
         if pacer_case_id:
             duplicate_rd_queryset = RECAPDocument.objects.filter(**params)

From d32cf493b6ea775046bea363de2fa6c7bb653d61 Mon Sep 17 00:00:00 2001
From: William Palin
Date: Thu, 7 Nov 2024 09:44:35 -0500
Subject: [PATCH 3/6] fix(recap.mergers): Use Correct key in params

---
 cl/recap/mergers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py
index 519a59de91..578c883e02 100644
--- a/cl/recap/mergers.py
+++ b/cl/recap/mergers.py
@@ -1665,7 +1665,7 @@ async def merge_attachment_page_data(
             except RECAPDocument.DoesNotExist as exc:
                 # In cases where we have "doppelgänger" dockets drop pacer
                 # case id and check if the docket exists once more.
-            if params.get("pacer_case_id"):
+            if params.get("docket_entry__docket__pacer_case_id"):
                 retry_params = params.copy()
                 retry_params.pop(
                     "docket_entry__docket__pacer_case_id", None

From e5321579117fb354f03249e07fa15874b691ee0e Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Tue, 3 Dec 2024 18:13:40 -0600
Subject: [PATCH 4/6] =?UTF-8?q?fix(recap):=20Added=20support=20for=20proce?=
 =?UTF-8?q?ssing=20attachment=20pages=20in=20doppelg=C3=A4nger=20cases?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cl/recap/mergers.py |  69 +++++++---
 cl/recap/tasks.py   |  81 ++++++++----
 cl/recap/tests.py   | 307 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 415 insertions(+), 42 deletions(-)

diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py
index 578c883e02..7b9777a85d 100644
--- a/cl/recap/mergers.py
+++ b/cl/recap/mergers.py
@@ -1605,6 +1605,54 @@ async def clean_duplicate_attachment_entries(
     await duplicate_rd_queryset.exclude(pk=keep_rd.pk).adelete()
 
 
+async def look_for_doppelganger_rds(
+    court: Court, pq: ProcessingQueue, pacer_doc_id: int, text: str
+) -> list[ProcessingQueue]:
+    """Identify and process potential RECAPDocuments with the same pacer_doc_id
+    in the court that likely belong to a doppelgänger case.
+    Return a list of ProcessingQueue instances for processing them.
+
+    :param court: The court associated with the PACER document.
+    :param pq: The original processing queue object.
+    :param pacer_doc_id: The PACER document ID to match against.
+    :param text: The attachment page text.
+    :return: A list of ProcessingQueue objects to process.
+    """
+    main_rds = (
+        RECAPDocument.objects.select_related("docket_entry__docket")
+        .filter(
+            pacer_doc_id=pacer_doc_id,
+            docket_entry__docket__court=court,
+        )
+        .order_by("docket_entry__docket__pacer_case_id")
+        .distinct("docket_entry__docket__pacer_case_id")
+        .only(
+            "pacer_doc_id",
+            "docket_entry__docket__pacer_case_id",
+            "docket_entry__docket__court_id",
+        )
+    )
+    pqs_to_process = [pq]  # Add the original pq to the list of pqs to process
+    original_file_content = text.encode("utf-8")
+    original_file_name = pq.filepath_local.name
+    async for main_rd in main_rds:
+        main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
+        if main_pacer_case_id != pq.pacer_case_id:
+            # Create additional pqs for each doppelgänger case found.
+            pq_created = await ProcessingQueue.objects.acreate(
+                uploader_id=pq.uploader_id,
+                pacer_doc_id=pacer_doc_id,
+                pacer_case_id=main_pacer_case_id,
+                court_id=court.pk,
+                upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
+                filepath_local=ContentFile(
+                    original_file_content, name=original_file_name
+                ),
+            )
+            pqs_to_process.append(pq_created)
+    return pqs_to_process
+
+
 async def merge_attachment_page_data(
     court: Court,
     pacer_case_id: int,
@@ -1658,23 +1706,10 @@ async def merge_attachment_page_data(
                 .afirst()
             )
         else:
-            try:
-                main_rd = await RECAPDocument.objects.select_related(
-                    "docket_entry", "docket_entry__docket"
-                ).aget(**params)
-            except RECAPDocument.DoesNotExist as exc:
-                # In cases where we have "doppelgänger" dockets drop pacer
-                # case id and check if the docket exists once more.
- if params.get("docket_entry__docket__pacer_case_id"): - retry_params = params.copy() - retry_params.pop( - "docket_entry__docket__pacer_case_id", None - ) - main_rd = await RECAPDocument.objects.select_related( - "docket_entry", "docket_entry__docket" - ).aget(**retry_params) - else: - raise exc + main_rd = await RECAPDocument.objects.select_related( + "docket_entry", "docket_entry__docket" + ).aget(**params) + except RECAPDocument.MultipleObjectsReturned as exc: if pacer_case_id: duplicate_rd_queryset = RECAPDocument.objects.filter(**params) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 06377727a9..396314a25d 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -77,6 +77,7 @@ find_docket_object, get_data_from_appellate_att_report, get_data_from_att_report, + look_for_doppelganger_rds, merge_attachment_page_data, merge_pacer_docket_into_cl_docket, process_orphan_documents, @@ -115,7 +116,7 @@ async def process_recap_upload(pq: ProcessingQueue) -> None: docket = await process_recap_docket(pq.pk) await sync_to_async(add_or_update_recap_docket.delay)(docket) elif pq.upload_type == UPLOAD_TYPE.ATTACHMENT_PAGE: - await process_recap_attachment(pq.pk) + await look_for_doppelganger_rds_and_process_recap_attachment(pq.pk) elif pq.upload_type == UPLOAD_TYPE.PDF: await process_recap_pdf(pq.pk) elif pq.upload_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT: @@ -657,6 +658,55 @@ async def process_recap_docket(pk): } +async def get_att_data_from_pq( + pq: ProcessingQueue, +) -> tuple[ProcessingQueue | None, dict | None, str | None]: + """Extract attachment data from a ProcessingQueue object. + + :param pq: The ProcessingQueue object. + :return: A tuple containing the updated pq, att_data, and text. + """ + try: + with pq.filepath_local.open("r") as file: + text = file.read().decode() + except IOError as exc: + msg = f"Internal processing error ({exc.errno}: {exc.strerror})." + await mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED) + return None, None, None + + att_data = get_data_from_att_report(text, pq.court_id) + if not att_data: + msg = "Not a valid attachment page upload." + await mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT) + return None, None, None + + if pq.pacer_case_id in ["undefined", "null"]: + pq.pacer_case_id = att_data.get("pacer_case_id") + await pq.asave() + + return pq, att_data, text + + +async def look_for_doppelganger_rds_and_process_recap_attachment( + pk: int, +) -> None: + """Look for doppelgänger RECAPDocuments and process the corresponding + attachment page for each RECAPDocument. + + :param pk: Primary key of the processing queue item. + :return: None + """ + + pq = await ProcessingQueue.objects.aget(pk=pk) + court = await Court.objects.aget(id=pq.court_id) + pq, att_data, text = await get_att_data_from_pq(pq) + pqs_to_process = await look_for_doppelganger_rds( + court, pq, att_data["pacer_doc_id"], text + ) + for pq in pqs_to_process: + await process_recap_attachment(pq.pk) + + async def process_recap_attachment( pk: int, tag_names: Optional[List[str]] = None, @@ -664,7 +714,6 @@ async def process_recap_attachment( ) -> Optional[Tuple[int, str, list[RECAPDocument]]]: """Process an uploaded attachment page from the RECAP API endpoint. - :param self: The Celery task :param pk: The primary key of the processing queue item you want to work on :param tag_names: A list of tag names to add to all items created or modified in this function. 
@@ -678,30 +727,11 @@ async def process_recap_attachment(
     await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
     logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")
 
-    try:
-        text = pq.filepath_local.read().decode()
-    except IOError as exc:
-        msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
-        pq_status, msg = await mark_pq_status(
-            pq, msg, PROCESSING_STATUS.FAILED
-        )
-        return pq_status, msg, []
-
-    att_data = get_data_from_att_report(text, pq.court_id)
-    logger.info(f"Parsing completed for item {pq}")
-
-    if att_data == {}:
-        # Bad attachment page.
-        msg = "Not a valid attachment page upload."
-        pq_status, msg = await mark_pq_status(
-            pq, msg, PROCESSING_STATUS.INVALID_CONTENT
-        )
-        return pq_status, msg, []
+    pq = await ProcessingQueue.objects.aget(pk=pk)
+    await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
+    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")
 
-    if pq.pacer_case_id in ["undefined", "null"]:
-        # Bad data from the client. Fix it with parsed data.
-        pq.pacer_case_id = att_data.get("pacer_case_id")
-        await pq.asave()
+    pq, att_data, text = await get_att_data_from_pq(pq)
 
     if document_number is None:
         document_number = att_data["document_number"]
@@ -735,6 +765,7 @@
     await add_tags_to_objs(tag_names, rds_affected)
     await associate_related_instances(pq, d_id=de.docket_id, de_id=de.pk)
     pq_status, msg = await mark_pq_successful(pq)
+
     return pq_status, msg, rds_affected
diff --git a/cl/recap/tests.py b/cl/recap/tests.py
index 0c19b4838f..099855883a 100644
--- a/cl/recap/tests.py
+++ b/cl/recap/tests.py
@@ -97,6 +97,7 @@
     EmailProcessingQueue,
     FjcIntegratedDatabase,
     PacerFetchQueue,
+    PacerHtmlFiles,
     ProcessingQueue,
 )
 from cl.recap.tasks import (
@@ -104,6 +105,7 @@
     do_pacer_fetch,
     fetch_pacer_doc_by_rd,
     get_and_copy_recap_attachment_docs,
+    look_for_doppelganger_rds_and_process_recap_attachment,
     process_recap_acms_appellate_attachment,
     process_recap_acms_docket,
     process_recap_appellate_attachment,
@@ -177,6 +179,28 @@ def setUpTestData(cls):
             ],
         )
 
+        cls.att_data_2 = AppellateAttachmentPageFactory(
+            attachments=[
+                AppellateAttachmentFactory(
+                    pacer_doc_id="04505578698", attachment_number=1
+                ),
+                AppellateAttachmentFactory(
+                    pacer_doc_id="04505578699", attachment_number=2
+                ),
+            ],
+            pacer_doc_id="04505578697",
+            pacer_case_id="104491",
+            document_number="1",
+        )
+        cls.de_data_2 = DocketEntriesDataFactory(
+            docket_entries=[
+                DocketEntryDataFactory(
+                    pacer_doc_id="04505578697",
+                    document_number=1,
+                )
+            ],
+        )
+
     def setUp(self) -> None:
         self.async_client = AsyncAPIClient()
         self.user = User.objects.get(username="recap")
@@ -766,6 +790,289 @@ def test_processing_an_acms_attachment_page(self, mock_upload):
             main_attachment[0].document_type, RECAPDocument.ATTACHMENT
         )
 
+    def test_processing_doppelganger_case_attachment_page(self, mock_upload):
+        """Can we replicate an attachment page upload from a doppelgänger case
+        to its corresponding RD across all related dockets?
+        """
+
+        d_1 = DocketFactory(
+            source=Docket.RECAP,
+            docket_number="23-4567",
+            court=self.court,
+            pacer_case_id="104490",
+        )
+        d_2 = DocketFactory(
+            source=Docket.RECAP,
+            docket_number="23-4567",
+            court=self.court,
+            pacer_case_id="104491",
+        )
+        d_3 = DocketFactory(
+            source=Docket.RECAP,
+            docket_number="23-4567",
+            court=self.court,
+            pacer_case_id="104492",
+        )
+
+        # Add the docket entry to every case.
+        async_to_sync(add_docket_entries)(
+            d_1, self.de_data_2["docket_entries"]
+        )
+        async_to_sync(add_docket_entries)(
+            d_2, self.de_data_2["docket_entries"]
+        )
+        async_to_sync(add_docket_entries)(
+            d_3, self.de_data_2["docket_entries"]
+        )
+
+        # Create an initial PQ.
+        pq = ProcessingQueue.objects.create(
+            court=self.court,
+            uploader=self.user,
+            pacer_case_id="104491",
+            upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
+            filepath_local=self.f,
+        )
+        d_1_recap_document = RECAPDocument.objects.filter(
+            docket_entry__docket=d_1
+        )
+        d_2_recap_document = RECAPDocument.objects.filter(
+            docket_entry__docket=d_2
+        )
+        d_3_recap_document = RECAPDocument.objects.filter(
+            docket_entry__docket=d_3
+        )
+
+        main_d_1_rd = d_1_recap_document[0]
+        main_d_2_rd = d_2_recap_document[0]
+        main_d_3_rd = d_2_recap_document[0]
+
+        # After adding 1 docket entry, it should only exist its main RD on
+        # every docket
+        self.assertEqual(d_1_recap_document.count(), 1)
+        self.assertEqual(d_2_recap_document.count(), 1)
+        self.assertEqual(d_3_recap_document.count(), 1)
+
+        self.assertEqual(
+            main_d_1_rd.document_type, RECAPDocument.PACER_DOCUMENT
+        )
+        self.assertEqual(
+            main_d_2_rd.document_type, RECAPDocument.PACER_DOCUMENT
+        )
+        self.assertEqual(
+            main_d_3_rd.document_type, RECAPDocument.PACER_DOCUMENT
+        )
+
+        with mock.patch(
+            "cl.recap.tasks.get_data_from_att_report",
+            side_effect=lambda x, y: self.att_data_2,
+        ):
+            # Process the attachment page containing 2 attachments.
+            async_to_sync(
+                look_for_doppelganger_rds_and_process_recap_attachment
+            )(pq.pk)
+
+        # After adding attachments, it should exist 3 RD on every docket.
+        self.assertEqual(
+            d_1_recap_document.count(),
+            3,
+            msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_2.pacer_case_id}.",
+        )
+        self.assertEqual(
+            d_2_recap_document.count(),
+            3,
+            msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_1.pacer_case_id}.",
+        )
+        self.assertEqual(
+            d_3_recap_document.count(),
+            3,
+            msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_3.pacer_case_id}.",
+        )
+
+        main_d_1_rd.refresh_from_db()
+        main_d_2_rd.refresh_from_db()
+        main_d_2_rd.refresh_from_db()
+        self.assertEqual(
+            main_d_1_rd.pacer_doc_id,
+            self.de_data_2["docket_entries"][0]["pacer_doc_id"],
+        )
+        self.assertEqual(
+            main_d_2_rd.pacer_doc_id,
+            self.de_data_2["docket_entries"][0]["pacer_doc_id"],
+        )
+        self.assertEqual(
+            main_d_3_rd.pacer_doc_id,
+            self.de_data_2["docket_entries"][0]["pacer_doc_id"],
+        )
+
+        # Two of them should be attachments.
+        d_1_attachments = RECAPDocument.objects.filter(
+            docket_entry__docket=d_1, document_type=RECAPDocument.ATTACHMENT
+        )
+        d_2_attachments = RECAPDocument.objects.filter(
+            docket_entry__docket=d_2, document_type=RECAPDocument.ATTACHMENT
+        )
+        d_3_attachments = RECAPDocument.objects.filter(
+            docket_entry__docket=d_3, document_type=RECAPDocument.ATTACHMENT
+        )
+
+        self.assertEqual(
+            d_1_attachments.count(),
+            2,
+            msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {d_1.pacer_case_id}.",
+        )
+        self.assertEqual(
+            d_2_attachments.count(),
+            2,
+            msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {d_2.pacer_case_id}.",
+        )
+        self.assertEqual(
+            d_3_attachments.count(),
+            2,
+            msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {d_3.pacer_case_id}.",
+        )
+
+        att_1_data = self.att_data_2["attachments"][0]
+        att_2_data = self.att_data_2["attachments"][0]
+
+        self.assertEqual(
+            d_1_attachments.filter(pacer_doc_id=att_1_data["pacer_doc_id"])
+            .first()
+            .attachment_number,
+            att_1_data["attachment_number"],
+        )
+        self.assertEqual(
+            d_1_attachments.filter(pacer_doc_id=att_2_data["pacer_doc_id"])
+            .first()
+            .attachment_number,
+            att_2_data["attachment_number"],
+        )
+        self.assertEqual(
+            d_2_attachments.filter(pacer_doc_id=att_1_data["pacer_doc_id"])
+            .first()
+            .attachment_number,
+            att_1_data["attachment_number"],
+        )
+        self.assertEqual(
+            d_2_attachments.filter(pacer_doc_id=att_2_data["pacer_doc_id"])
+            .first()
+            .attachment_number,
+            att_2_data["attachment_number"],
+        )
+
+        # Assert the number of PQs created to process the additional doppelgänger RDs.
+        pqs_created = ProcessingQueue.objects.all()
+        self.assertEqual(pqs_created.count(), 3)
+
+        pqs_status = {pq.status for pq in pqs_created}
+        self.assertEqual(pqs_status, {PROCESSING_STATUS.SUCCESSFUL})
+
+        pqs_related_dockets = {pq.docket_id for pq in pqs_created}
+        self.assertEqual(pqs_related_dockets, {d_1.pk, d_2.pk, d_3.pk})
+
+        # 3 PacerHtmlFiles should have been created, one for each case.
+        att_html_created = PacerHtmlFiles.objects.all()
+        self.assertEqual(att_html_created.count(), 3)
+        related_htmls_de = {
+            html.content_object.pk for html in att_html_created
+        }
+        self.assertEqual(
+            {de.pk for de in DocketEntry.objects.all()}, related_htmls_de
+        )
+
+    def test_process_attachments_for_doppelganger_pq_with_missing_main_rd(
+        self, mock_upload
+    ):
+        """Confirm that if the RD related to the initial PQ is missing,
+        we can still process attachments for doppelgänger cases where the
+        main RD matches.
+        """
+
+        d_1 = DocketFactory(
+            source=Docket.RECAP,
+            docket_number="23-4567",
+            court=self.court,
+            pacer_case_id="104490",
+        )
+        d_2 = DocketFactory(
+            source=Docket.RECAP,
+            docket_number="23-4567",
+            court=self.court,
+            pacer_case_id="104491",
+        )
+        # Add the docket entry only to d_1.
+        async_to_sync(add_docket_entries)(
+            d_1, self.de_data_2["docket_entries"]
+        )
+
+        # Create an initial PQ related to d_1
+        pq = ProcessingQueue.objects.create(
+            court=self.court,
+            uploader=self.user,
+            pacer_case_id="104491",
+            upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
+            filepath_local=self.f,
+        )
+        d_1_recap_document = RECAPDocument.objects.filter(
+            docket_entry__docket=d_1
+        )
+        d_2_recap_document = RECAPDocument.objects.filter(
+            docket_entry__docket=d_2
+        )
+
+        # After adding 1 docket entry d_1
+        self.assertEqual(
+            d_1_recap_document.count(),
+            1,
+            msg=f"Didn't get the initial number of RDs for the docket with PACER case ID {d_1.pacer_case_id}",
+        )
+        self.assertEqual(
+            d_2_recap_document.count(),
+            0,
+            msg=f"Didn't get the initial number of RDs for the docket with PACER case ID {d_2.pacer_case_id}",
+        )
+
+        with mock.patch(
+            "cl.recap.tasks.get_data_from_att_report",
+            side_effect=lambda x, y: self.att_data_2,
+        ):
+            # Process the attachment page containing 2 attachments.
+            async_to_sync(
+                look_for_doppelganger_rds_and_process_recap_attachment
+            )(pq.pk)
+
+        # After adding attachments, it should exist 3 RD on every docket.
+        self.assertEqual(
+            d_1_recap_document.count(),
+            3,
+            msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_2.pacer_case_id}.",
+        )
+        self.assertEqual(
+            d_2_recap_document.count(),
+            0,
+            msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_1.pacer_case_id}.",
+        )
+
+        pq.refresh_from_db()
+        self.assertEqual(
+            pq.status,
+            PROCESSING_STATUS.FAILED,
+            msg="Didn't get the expected error message.",
+        )
+        self.assertEqual(
+            pq.error_message,
+            "Could not find docket to associate with attachment metadata",
+        )
+
+        successful_pq = ProcessingQueue.objects.all().exclude(pk=pq.pk)
+        self.assertEqual(successful_pq.count(), 1)
+        self.assertEqual(successful_pq[0].status, PROCESSING_STATUS.SUCCESSFUL)
+        self.assertEqual(
+            successful_pq[0].docket_id,
+            d_1.pk,
+            msg="Didn't get the expected docket ID.",
+        )
+
     @mock.patch("cl.recap.tasks.DocketReport", new=fakes.FakeDocketReport)
     @mock.patch(

From 5ea2bd104247aa9f2d4b7b6d668230fcb82937fd Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Wed, 4 Dec 2024 08:54:18 -0600
Subject: [PATCH 5/6] fix(elasticsearch): Fixed encoding when opening the attachment file

---
 cl/recap/tasks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py
index a2b6898cd0..edbebe3de7 100644
--- a/cl/recap/tasks.py
+++ b/cl/recap/tasks.py
@@ -667,8 +667,8 @@ async def get_att_data_from_pq(
     :return: A tuple containing the updated pq, att_data, and text.
     """
     try:
-        with pq.filepath_local.open("r") as file:
-            text = file.read().decode()
+        with pq.filepath_local.open("rb") as file:
+            text = file.read().decode("utf-8")
     except IOError as exc:
         msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
         await mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)

From f722807dd5c3df4f1ba7d8f4bb634e6e97021748 Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Fri, 13 Dec 2024 13:46:20 -0600
Subject: [PATCH 6/6] fix(recap): Refined approach for looking up and processing subdockets att pages

---
 cl/recap/mergers.py | 48 ----------------------------------------
 cl/recap/tasks.py   | 54 ++++++++++++++++++++++++++++++++++++---------
 cl/recap/tests.py   | 20 +++++++----------
 3 files changed, 51 insertions(+), 71 deletions(-)

diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py
index 627ad9431c..0bbef5a5ec 100644
--- a/cl/recap/mergers.py
+++ b/cl/recap/mergers.py
@@ -1614,54 +1614,6 @@ async def clean_duplicate_attachment_entries(
     await duplicate_rd_queryset.exclude(pk=keep_rd.pk).adelete()
 
 
-async def look_for_doppelganger_rds(
-    court: Court, pq: ProcessingQueue, pacer_doc_id: int, text: str
-) -> list[ProcessingQueue]:
-    """Identify and process potential RECAPDocuments with the same pacer_doc_id
-    in the court that likely belong to a doppelgänger case.
-    Return a list of ProcessingQueue instances for processing them.
-
-    :param court: The court associated with the PACER document.
-    :param pq: The original processing queue object.
-    :param pacer_doc_id: The PACER document ID to match against.
-    :param text: The attachment page text.
-    :return: A list of ProcessingQueue objects to process.
-    """
-    main_rds = (
-        RECAPDocument.objects.select_related("docket_entry__docket")
-        .filter(
-            pacer_doc_id=pacer_doc_id,
-            docket_entry__docket__court=court,
-        )
-        .order_by("docket_entry__docket__pacer_case_id")
-        .distinct("docket_entry__docket__pacer_case_id")
-        .only(
-            "pacer_doc_id",
-            "docket_entry__docket__pacer_case_id",
-            "docket_entry__docket__court_id",
-        )
-    )
-    pqs_to_process = [pq]  # Add the original pq to the list of pqs to process
-    original_file_content = text.encode("utf-8")
-    original_file_name = pq.filepath_local.name
-    async for main_rd in main_rds:
-        main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
-        if main_pacer_case_id != pq.pacer_case_id:
-            # Create additional pqs for each doppelgänger case found.
-            pq_created = await ProcessingQueue.objects.acreate(
-                uploader_id=pq.uploader_id,
-                pacer_doc_id=pacer_doc_id,
-                pacer_case_id=main_pacer_case_id,
-                court_id=court.pk,
-                upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
-                filepath_local=ContentFile(
-                    original_file_content, name=original_file_name
-                ),
-            )
-            pqs_to_process.append(pq_created)
-    return pqs_to_process
-
-
 async def merge_attachment_page_data(
     court: Court,
     pacer_case_id: int,
diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py
index 075de3f276..364fe5e49c 100644
--- a/cl/recap/tasks.py
+++ b/cl/recap/tasks.py
@@ -77,7 +77,6 @@
     find_docket_object,
     get_data_from_appellate_att_report,
     get_data_from_att_report,
-    look_for_doppelganger_rds,
     merge_attachment_page_data,
     merge_pacer_docket_into_cl_docket,
     process_orphan_documents,
@@ -111,7 +110,9 @@ async def process_recap_upload(pq: ProcessingQueue) -> None:
     if pq.upload_type == UPLOAD_TYPE.DOCKET:
         docket = await process_recap_docket(pq.pk)
     elif pq.upload_type == UPLOAD_TYPE.ATTACHMENT_PAGE:
-        await look_for_doppelganger_rds_and_process_recap_attachment(pq.pk)
+        sub_docket_att_page_pks = await find_subdocket_att_page_rds(pq.pk)
+        for pq_pk in sub_docket_att_page_pks:
+            await process_recap_attachment(pq_pk)
     elif pq.upload_type == UPLOAD_TYPE.PDF:
         await process_recap_pdf(pq.pk)
     elif pq.upload_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
@@ -675,24 +676,55 @@ async def get_att_data_from_pq(
     return pq, att_data, text
 
 
-async def look_for_doppelganger_rds_and_process_recap_attachment(
+async def find_subdocket_att_page_rds(
     pk: int,
-) -> None:
-    """Look for doppelgänger RECAPDocuments and process the corresponding
-    attachment page for each RECAPDocument.
+) -> list[int]:
+    """Look for RECAP Documents that belong to subdockets, and create a PQ
+    object for each additional attachment page that requires processing.
 
     :param pk: Primary key of the processing queue item.
-    :return: None
+    :return: A list of ProcessingQueue pks to process.
     """
 
     pq = await ProcessingQueue.objects.aget(pk=pk)
     court = await Court.objects.aget(id=pq.court_id)
     pq, att_data, text = await get_att_data_from_pq(pq)
-    pqs_to_process = await look_for_doppelganger_rds(
-        court, pq, att_data["pacer_doc_id"], text
+    pacer_doc_id = att_data["pacer_doc_id"]
+    main_rds = (
+        RECAPDocument.objects.select_related("docket_entry__docket")
+        .filter(
+            pacer_doc_id=pacer_doc_id,
+            docket_entry__docket__court=court,
+        )
+        .order_by("docket_entry__docket__pacer_case_id")
+        .distinct("docket_entry__docket__pacer_case_id")
+        .only(
+            "pacer_doc_id",
+            "docket_entry__docket__pacer_case_id",
+            "docket_entry__docket__court_id",
+        )
+        .exclude(docket_entry__docket__pacer_case_id=pq.pacer_case_id)
     )
-    for pq in pqs_to_process:
-        await process_recap_attachment(pq.pk)
+    pqs_to_process_pks = [
+        pq.pk
+    ]  # Add the original pq to the list of pqs to process
+    original_file_content = text.encode("utf-8")
+    original_file_name = pq.filepath_local.name
+    async for main_rd in main_rds:
+        main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
+        # Create additional pqs for each subdocket case found.
+        pq_created = await ProcessingQueue.objects.acreate(
+            uploader_id=pq.uploader_id,
+            pacer_doc_id=pacer_doc_id,
+            pacer_case_id=main_pacer_case_id,
+            court_id=court.pk,
+            upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
+            filepath_local=ContentFile(
+                original_file_content, name=original_file_name
+            ),
+        )
+        pqs_to_process_pks.append(pq_created.pk)
+    return pqs_to_process_pks
 
 
 async def process_recap_attachment(
diff --git a/cl/recap/tests.py b/cl/recap/tests.py
index aa50779b9d..bce48790c8 100644
--- a/cl/recap/tests.py
+++ b/cl/recap/tests.py
@@ -108,7 +108,6 @@
     do_pacer_fetch,
     fetch_pacer_doc_by_rd,
     get_and_copy_recap_attachment_docs,
-    look_for_doppelganger_rds_and_process_recap_attachment,
     process_recap_acms_appellate_attachment,
     process_recap_acms_docket,
     process_recap_appellate_attachment,
@@ -117,6 +116,7 @@
     process_recap_claims_register,
     process_recap_docket,
     process_recap_pdf,
+    process_recap_upload,
     process_recap_zip,
 )
 from cl.recap_rss.tasks import merge_rss_feed_contents
@@ -793,8 +793,8 @@ def test_processing_an_acms_attachment_page(self, mock_upload):
             main_attachment[0].document_type, RECAPDocument.ATTACHMENT
         )
 
-    def test_processing_doppelganger_case_attachment_page(self, mock_upload):
-        """Can we replicate an attachment page upload from a doppelgänger case
+    def test_processing_subdocket_case_attachment_page(self, mock_upload):
+        """Can we replicate an attachment page upload from a subdocket case
         to its corresponding RD across all related dockets?
         """
 
@@ -871,9 +871,7 @@ def test_processing_doppelganger_case_attachment_page(self, mock_upload):
             side_effect=lambda x, y: self.att_data_2,
         ):
             # Process the attachment page containing 2 attachments.
-            async_to_sync(
-                look_for_doppelganger_rds_and_process_recap_attachment
-            )(pq.pk)
+            async_to_sync(process_recap_upload)(pq)
 
         # After adding attachments, it should exist 3 RD on every docket.
         self.assertEqual(
@@ -963,7 +961,7 @@ def test_processing_doppelganger_case_attachment_page(self, mock_upload):
             att_2_data["attachment_number"],
         )
 
-        # Assert the number of PQs created to process the additional doppelgänger RDs.
+        # Assert the number of PQs created to process the additional subdocket RDs.
         pqs_created = ProcessingQueue.objects.all()
         self.assertEqual(pqs_created.count(), 3)
 
@@ -983,11 +981,11 @@ def test_processing_doppelganger_case_attachment_page(self, mock_upload):
             {de.pk for de in DocketEntry.objects.all()}, related_htmls_de
         )
 
-    def test_process_attachments_for_doppelganger_pq_with_missing_main_rd(
+    def test_process_attachments_for_subdocket_pq_with_missing_main_rd(
         self, mock_upload
     ):
         """Confirm that if the RD related to the initial PQ is missing,
-        we can still process attachments for doppelganger cases where the
+        we can still process attachments for subdocket cases where the
         main RD matches.
         """
 
@@ -1040,9 +1038,7 @@ def test_process_attachments_for_doppelganger_pq_with_missing_main_rd(
             side_effect=lambda x, y: self.att_data_2,
         ):
             # Process the attachment page containing 2 attachments.
-            async_to_sync(
-                look_for_doppelganger_rds_and_process_recap_attachment
-            )(pq.pk)
+            async_to_sync(process_recap_upload)(pq)
 
         # After adding attachments, it should exist 3 RD on every docket.
         self.assertEqual(