Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

4826 Replicate RECAP PDF uploads to subdockets #4857

Merged
merged 6 commits into from
Jan 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 12 additions & 22 deletions cl/recap/api_serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,10 @@ def validate(self, attrs):
UPLOAD_TYPE.CASE_QUERY_RESULT_PAGE,
]:
# These are district or bankruptcy court dockets. Is the court valid?
court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list(
"pk", flat=True
court_ids = (
Court.federal_courts.district_or_bankruptcy_pacer_courts()
)
if attrs["court"].pk not in court_ids:
if not court_ids.filter(pk=attrs["court"].pk).exists():
raise ValidationError(
"%s is not a district or bankruptcy court ID. Did you "
"mean to use the upload_type for appellate dockets?"
Expand All @@ -108,11 +108,9 @@ def validate(self, attrs):
if attrs["upload_type"] == UPLOAD_TYPE.CLAIMS_REGISTER:
# Only allowed on bankruptcy courts
bankruptcy_court_ids = (
Court.federal_courts.bankruptcy_pacer_courts().values_list(
"pk", flat=True
)
Court.federal_courts.bankruptcy_pacer_courts()
)
if attrs["court"].pk not in bankruptcy_court_ids:
if not bankruptcy_court_ids.filter(pk=attrs["court"].pk).exists():
raise ValidationError(
"%s is not a bankruptcy court ID. Only bankruptcy cases "
"should have claims registry pages." % attrs["court"]
Expand All @@ -127,12 +125,8 @@ def validate(self, attrs):
UPLOAD_TYPE.APPELLATE_CASE_QUERY_RESULT_PAGE,
]:
# Appellate court dockets. Is the court valid?
appellate_court_ids = (
Court.federal_courts.appellate_pacer_courts().values_list(
"pk", flat=True
)
)
if attrs["court"].pk not in appellate_court_ids:
appellate_court_ids = Court.federal_courts.appellate_pacer_courts()
if not appellate_court_ids.filter(pk=attrs["court"].pk).exists():
raise ValidationError(
"%s is not an appellate court ID. Did you mean to use the "
"upload_type for district dockets?" % attrs["court"]
Expand Down Expand Up @@ -203,11 +197,8 @@ def validate(self, attrs):
mail = attrs["mail"]
receipt = attrs["receipt"]

all_court_ids = Court.federal_courts.all_pacer_courts().values_list(
"pk", flat=True
)

if court_id not in all_court_ids:
all_court_ids = Court.federal_courts.all_pacer_courts()
if not all_court_ids.filter(pk=court_id).exists():
raise ValidationError(
f"{attrs['court'].pk} is not a PACER court ID."
)
Expand Down Expand Up @@ -274,10 +265,9 @@ class Meta:

def validate(self, attrs):
# Is it a good court value?
valid_court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list(
"pk", flat=True
valid_court_ids = (
Court.federal_courts.district_or_bankruptcy_pacer_courts()
)

if (
attrs.get("court")
or attrs.get("docket")
Expand All @@ -293,7 +283,7 @@ def validate(self, attrs):
if attrs.get("court")
else attrs["docket"].court_id
)
if court_id not in valid_court_ids:
if not valid_court_ids.filter(pk=court_id).exists():
raise ValidationError(f"Invalid court id: {court_id}")

# Docket validations
Expand Down
146 changes: 113 additions & 33 deletions cl/recap/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from django.core.files.base import ContentFile, File
from django.core.files.uploadedfile import SimpleUploadedFile
from django.db import IntegrityError, transaction
from django.db.models import QuerySet
from django.utils.timezone import now
from juriscraper.lib.exceptions import PacerLoginException, ParsingException
from juriscraper.lib.string_utils import CaseNameTweaker, harmonize
Expand Down Expand Up @@ -114,7 +115,9 @@ async def process_recap_upload(pq: ProcessingQueue) -> None:
for pq_pk in sub_docket_att_page_pks:
await process_recap_attachment(pq_pk)
elif pq.upload_type == UPLOAD_TYPE.PDF:
await process_recap_pdf(pq.pk)
sub_docket_pdf_pks = await find_subdocket_pdf_rds(pq.pk)
for pq_pk in sub_docket_pdf_pks:
await process_recap_pdf(pq_pk)
elif pq.upload_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT:
docket = await process_recap_docket_history_report(pq.pk)
elif pq.upload_type == UPLOAD_TYPE.APPELLATE_DOCKET:
Expand Down Expand Up @@ -676,6 +679,30 @@ async def get_att_data_from_pq(
return pq, att_data, text


def get_main_rds(court_id: str, pacer_doc_id: str) -> QuerySet:
    """
    Build the queryset of main RECAPDocuments that share a pacer_doc_id
    within a single court, keeping one document per docket pacer_case_id.

    The queryset is returned unevaluated, so callers can chain further
    filters or exclusions before iterating it.

    :param court_id: The ID of the court the documents' dockets belong to.
    :param pacer_doc_id: The pacer document ID to match.
    :return: A RECAPDocument queryset ordered by, and distinct on, the
    docket's pacer_case_id, loading only the fields needed by callers.
    """
    case_id_lookup = "docket_entry__docket__pacer_case_id"

    # Every RD in this court that carries the requested pacer_doc_id, with
    # its docket entry and docket fetched in the same query.
    matching_rds = RECAPDocument.objects.select_related(
        "docket_entry__docket"
    ).filter(
        pacer_doc_id=pacer_doc_id,
        docket_entry__docket__court_id=court_id,
    )

    # The ordering and the distinct() argument use the same lookup so that a
    # single RD is kept for each pacer_case_id.
    one_rd_per_case = matching_rds.order_by(case_id_lookup).distinct(
        case_id_lookup
    )

    return one_rd_per_case.only(
        "pacer_doc_id",
        case_id_lookup,
        "docket_entry__docket__court_id",
    )


async def find_subdocket_att_page_rds(
pk: int,
) -> list[int]:
Expand All @@ -687,43 +714,100 @@ async def find_subdocket_att_page_rds(
"""

pq = await ProcessingQueue.objects.aget(pk=pk)
court = await Court.objects.aget(id=pq.court_id)
pq, att_data, text = await get_att_data_from_pq(pq)
pacer_doc_id = att_data["pacer_doc_id"]
main_rds = (
RECAPDocument.objects.select_related("docket_entry__docket")
.filter(
pacer_doc_id=pacer_doc_id,
docket_entry__docket__court=court,
)
.order_by("docket_entry__docket__pacer_case_id")
.distinct("docket_entry__docket__pacer_case_id")
.only(
"pacer_doc_id",
"docket_entry__docket__pacer_case_id",
"docket_entry__docket__court_id",
)
.exclude(docket_entry__docket__pacer_case_id=pq.pacer_case_id)
main_rds = get_main_rds(pq.court_id, pacer_doc_id).exclude(
docket_entry__docket__pacer_case_id=pq.pacer_case_id
)
pqs_to_process_pks = [
pq.pk
] # Add the original pq to the list of pqs to process
original_file_content = text.encode("utf-8")
original_file_name = pq.filepath_local.name
async for main_rd in main_rds:
main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
# Create additional pqs for each subdocket case found.
pq_created = await ProcessingQueue.objects.acreate(
uploader_id=pq.uploader_id,
pacer_doc_id=pacer_doc_id,
pacer_case_id=main_pacer_case_id,
court_id=court.pk,
upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
filepath_local=ContentFile(
original_file_content, name=original_file_name
),

@sync_to_async
def save_pq_instances():
with transaction.atomic():
for main_rd in main_rds:
main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
# Create additional pqs for each subdocket case found.
pq_created = ProcessingQueue.objects.create(
uploader_id=pq.uploader_id,
pacer_doc_id=pacer_doc_id,
pacer_case_id=main_pacer_case_id,
court_id=pq.court_id,
upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE,
filepath_local=ContentFile(
original_file_content, name=original_file_name
),
)
pqs_to_process_pks.append(pq_created.pk)

await save_pq_instances()
return pqs_to_process_pks


async def find_subdocket_pdf_rds(
    pk: int,
) -> list[int]:
    """Look for RECAP Documents that belong to subdockets, and create a PQ
    object for each additional PDF upload that requires processing.

    If the original PQ has no pacer_case_id, it is assigned the
    pacer_case_id of the first matched RECAPDocument instead of creating a
    new PQ for that case.

    :param pk: Primary key of the processing queue item.
    :return: A list of ProcessingQueue pks to process. The original pk is
    always the first element.
    """

    pq = await ProcessingQueue.objects.aget(pk=pk)
    pqs_to_process_pks = [
        pq.pk
    ]  # Add the original pq to the list of pqs to process

    appellate_court_ids = Court.federal_courts.appellate_pacer_courts()
    if await appellate_court_ids.filter(pk=pq.court_id).aexists():
        # Abort the process for appellate documents. Subdockets cannot be found
        # in appellate cases.
        return pqs_to_process_pks

    main_rds = get_main_rds(pq.court_id, pq.pacer_doc_id)
    if pq.pacer_case_id:
        # If pq already has a pacer_case_id, exclude it from the queryset.
        main_rds = main_rds.exclude(
            docket_entry__docket__pacer_case_id=pq.pacer_case_id
        )

    @sync_to_async
    def save_pq_instances():
        # Everything in here is blocking (queryset evaluation, file read,
        # row writes), so it runs through sync_to_async rather than on the
        # event loop.
        pdf_binary_content = None
        with transaction.atomic():
            for i, main_rd in enumerate(main_rds):
                main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id
                if i == 0 and not pq.pacer_case_id:
                    # If the original PQ does not have a pacer_case_id,
                    # assign it a pacer_case_id from one of the matched RDs
                    # to ensure the RD lookup in process_recap_pdf succeeds.
                    pq.pacer_case_id = main_pacer_case_id
                    pq.save()
                    continue

                if pdf_binary_content is None:
                    # Read the uploaded PDF only once, and only when at
                    # least one additional PQ really has to be created.
                    # The context manager closes the file handle afterwards.
                    with pq.filepath_local.open("rb") as pdf_file:
                        pdf_binary_content = pdf_file.read()

                # Create additional pqs for each subdocket case found.
                pq_created = ProcessingQueue.objects.create(
                    uploader_id=pq.uploader_id,
                    pacer_doc_id=pq.pacer_doc_id,
                    pacer_case_id=main_pacer_case_id,
                    document_number=pq.document_number,
                    attachment_number=pq.attachment_number,
                    court_id=pq.court_id,
                    upload_type=UPLOAD_TYPE.PDF,
                    filepath_local=ContentFile(
                        pdf_binary_content, name=pq.filepath_local.name
                    ),
                )
                pqs_to_process_pks.append(pq_created.pk)

    await save_pq_instances()
    return pqs_to_process_pks


Expand All @@ -747,10 +831,6 @@ async def process_recap_attachment(
await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")

pq = await ProcessingQueue.objects.aget(pk=pk)
await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")

pq, att_data, text = await get_att_data_from_pq(pq)

if document_number is None:
Expand Down
Loading
Loading