Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle new appellate attachment page format #1240

Merged
merged 4 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion juriscraper/lib/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@
ALLOWED_ATTRIBUTES["div"] = {"class", "id"}
ALLOWED_ATTRIBUTES["font"] = {"face", "size"}
ALLOWED_ATTRIBUTES["form"] = {"name", "method", "action"}
ALLOWED_ATTRIBUTES["input"] = {"id", "name", "value", "type", "onclick"}
ALLOWED_ATTRIBUTES["input"] = {
"class",
"id",
"name",
"value",
"type",
"onclick",
}
ALLOWED_ATTRIBUTES["span"] = {"class"}
ALLOWED_ATTRIBUTES["table"].update({"border", "class"})
ALLOWED_ATTRIBUTES["tr"].add("class")
Expand Down
45 changes: 35 additions & 10 deletions juriscraper/pacer/appellate_attachment_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.string_utils import force_unicode
from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url
from juriscraper.pacer.utils import (
get_pacer_doc_id_from_doc1_url,
parse_sumDocSelected_from_row,
)

from ..lib.log_tools import make_default_logger
from .reports import BaseReport
Expand Down Expand Up @@ -97,14 +100,16 @@ def data(self) -> Dict:
}

for row in rows:
result["attachments"].append(
{
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
}
)
attachment = {
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
}
file_size_bytes = self._get_file_size_bytes_from_tr(row)
if file_size_bytes is not None:
attachment["file_size_bytes"] = file_size_bytes
result["attachments"].append(attachment)
return result

def _get_main_pacer_doc_id(self):
Expand Down Expand Up @@ -143,7 +148,10 @@ def _get_description_from_tr(self, row: html.HtmlElement) -> str:
row_nodes = row.xpath(".//td")
if not row_nodes:
return ""
description = row_nodes[-2].xpath("text()")
desc_idx = -2
if len(row_nodes) == 6:
desc_idx = -3
description = row_nodes[desc_idx].xpath("text()")
if description:
return force_unicode(description[0].strip())
return ""
Expand All @@ -156,11 +164,28 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]:
:param row: Table row as an lxml element
:return: Attachment page count
"""
sum_doc_selected_parts = parse_sumDocSelected_from_row(row)
if sum_doc_selected_parts and "page_count" in sum_doc_selected_parts:
return sum_doc_selected_parts["page_count"]

description_text_nodes = row.xpath(".//td/text()")
if not description_text_nodes:
return None
return int(description_text_nodes[-1].strip())

@staticmethod
def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]:
    """Return the file size in bytes for one row of the attachment table.

    The size is taken from the arguments of the row's sumDocSelected
    onclick handler, as parsed by parse_sumDocSelected_from_row.

    :param row: Table row as an lxml element
    :return: The file size in bytes, or None when the row yields no parsed
    sumDocSelected data or that data has no "file_size_bytes" entry.
    """
    parsed_args = parse_sumDocSelected_from_row(row)
    if not parsed_args:
        # Nothing could be parsed from this row.
        return None
    return parsed_args.get("file_size_bytes")

@staticmethod
def _get_pacer_doc_id(row: html.HtmlElement) -> Optional[str]:
"""Take in a row from the attachment table and return the pacer_doc_id
Expand Down
47 changes: 47 additions & 0 deletions juriscraper/pacer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,34 @@ def reverse_goDLS_function(s):
return parts


def reverse_sumDocSelected_function(s) -> Optional[Dict[str, int]]:
    """Extract the arguments from the sumDocSelected JavaScript function.

    In: sumDocSelected(this,1,13481, 7548050)
    Out: {
      'page_count': 1,
      'file_size_bytes': 13481,
      'doc_id': 7548050
    }

    The key names correspond to the form field names in the JavaScript on PACER:

     - page_count: Number of pages in the document.
     - file_size_bytes: Size of the file in bytes.
     - doc_id: document ID without court prefix, sometimes called dlsid.

    :param s: A string expected to contain a sumDocSelected(...) call, e.g.
    the value of an onclick attribute.
    :return: A dict with the three integer arguments, or None when the string
    has no sumDocSelected(...) call, the first argument is not "this", fewer
    than four arguments are present, or an argument is not an integer.
    """
    match = re.search(r"sumDocSelected\((.*?)\)", s)
    if match is None:
        # No call found; report "no data" rather than raising AttributeError.
        return None

    args = [arg.strip() for arg in match.group(1).split(",")]
    if len(args) < 4 or args[0] != "this":
        return None

    try:
        page_count, file_size_bytes, doc_id = (int(arg) for arg in args[1:4])
    except ValueError:
        # Non-numeric arguments mean the call is not in the expected shape.
        return None

    return {
        "page_count": page_count,
        "file_size_bytes": file_size_bytes,
        "doc_id": doc_id,
    }


def make_doc1_url(court_id, pacer_doc_id, skip_attachment_page):
"""Make a doc1 URL.

Expand Down Expand Up @@ -806,3 +834,22 @@ def parse_datetime_for_us_timezone(datetime_str: str) -> datetime:
# Raise an exception if a timezone abbreviation is not specified.
raise NotImplementedError(f"Datetime {datetime_str} not understood.")
return date_time


def parse_sumDocSelected_from_row(
    row: html.HtmlElement,
) -> Optional[Dict[str, int]]:
    """Parse the sumDocSelected call arguments found in a table row.

    Each <input class="selDocCl"> element in the row is inspected in document
    order. The first one whose onclick attribute mentions sumDocSelected is
    passed to reverse_sumDocSelected_function and that result is returned.

    :param row: Table row as an HtmlElement
    :return: A dictionary of parsed parameters from the sumDocSelected function,
    or None if the row does not contain such data.
    """
    for checkbox in row.xpath(".//input[@class='selDocCl']"):
        handlers = checkbox.xpath("./@onclick")
        if not handlers:
            continue
        handler = handlers[0]
        if "sumDocSelected" in handler:
            return reverse_sumDocSelected_function(handler)
    return None
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
{
"attachment_number": 1,
"description": "Main Document",
"file_size_bytes": 13481,
"pacer_doc_id": "00107548050",
"page_count": 3
"page_count": 1
},
{
"attachment_number": 2,
"description": "",
"file_size_bytes": 8890,
"pacer_doc_id": "00107548051",
"page_count": 5
"page_count": 1
}
],
"pacer_case_id": "46307",
Expand Down
139 changes: 139 additions & 0 deletions tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
<!DOCTYPE html>
<HTML><HEAD><TITLE>Document</TITLE><meta http-equiv="X-UA-Compatible" content="IE=edge" /></HEAD>
<BODY BGCOLOR=#FFFFDA>
<table border=0 align=center>
<tr><th align=center><b>2 Documents are attached to this filing</b><br><br></th></tr>
</table>
<table border=1 align=center cellpadding=5>
<tr><td>
<table border=0 cellpadding=5>
<tr><th align=center colspan=3>Document</th> <th align=center>Description</th><th align=center>Pages</th>
<th align=center>Size</th></tr>
<TR><TD ALIGN=center><INPUT class='selDocCl' TYPE='checkbox' NAME='md' VALUE='7548050' ONCLICK='sumDocSelected(this,1,13481, 7548050)' CHECKED></TD><td align=center>1</td><td align=center><a HREF='https://ecf.ca1.uscourts.gov/docs1/00117548050' ONCLICK="return doDocPostURL('00117548050')"><img title='Open Document' width='13' height='15' BORDER=2 SRC='TransportRoom?servlet=document.gif' ALT='Open document' ></a>&nbsp;</td><td>Main Document</td><td align=center>1</td><td align=center>13.17 KB</td></tr>
<TR><TD ALIGN=center><INPUT class='selDocCl' TYPE='checkbox' NAME='md' VALUE='7548051' ONCLICK='sumDocSelected(this,1,8890, 7548051)' CHECKED></TD><td align=center>2</td><td align=center><a HREF='https://ecf.ca1.uscourts.gov/docs1/00117548051' ONCLICK="return doDocPostURL('00117548051')"><img title='Open Document' width='13' height='15' BORDER=2 SRC='TransportRoom?servlet=document.gif' ALT='Open document' ></a>&nbsp;</td><td></td><td align=center>1</td><td align=center>8.68 KB</td></tr>
</table>
</td></tr>
</table>
<BR>
<CENTER>
<b>Selected Pages:
<input id='totPageCntFld' type='text' size='10' value='2' onfocus='this.blur();' />
&nbsp;&nbsp;Selected Size:
<input id='totByteSizeFld' type='text' size='10' value='21.85 KB' onfocus='this.blur();' />
</b>

<BR><b>Totals reflect accessible documents only and do not include unauthorized restricted documents.</b>
<BR><BR>
<div id='opts'><FORM name='dktEntry' action=''><INPUT Value='y' TYPE='checkbox' name='incPdfFooter'><B>Include Page Numbers</B><BR>
<INPUT NAME='viewSelBtn' Value='View Selected' TYPE='button' ONCLICK='ProcessForm("view")'>
<input type=hidden id='totPageHFld' name='totPageFld' value='2'>
<input type=hidden id='totBytesHFld' value='22371'>
<input type=hidden id='dynaTotPageId' name='dynaTotPageFld' value='2'>
<input type=hidden id='dynaTotBytesId' name='dynaTotBytesFld' value='22371'>
<input type=hidden id='pageLoadHFld' value='1'>
</FORM>
</div><div id='noOpts' style='visibility:hidden'><b>Selected documents cannot be combined due to size. Please remove some selections to be below 250 MB.</b></div><SCRIPT type='text/javascript'>
<!--
var winOptions = 'location=no,resizable,toolbar,status,scrollbars';
var winTarget = '_blank';
function doDocPostURL(dls) {
// This use of a function when a user clicks a document
// link was done so params aren't in copied doc hyperlinks.
// This allows user to right click & get the URL for copying doc
// links, but still gets params back to the server as needed
var aWin = window.open('TransportRoom?servlet=ShowDoc&caseId=46307&dls_id='+dls+'&caseId=46307',winTarget,winOptions,false);
return false;
}

window.onload = function() {
}

window.onpageshow = function(event) {
if(performance.navigation.type == 2){
location.reload(true);
}
}

var dlsIdArr = [7548050,7548051];
function getSize(bytes) {
if (bytes >= 1048576) {
return Math.round(bytes/1048576*100)/100 + ' MB';
}
return Math.round(bytes/1024*100)/100 + ' KB';
}
function tooManyDocs(disable) {
if (disable) {
hideOpts();
if (document.dktEntry.viewSelBtn.disabled == false) {
document.dktEntry.viewSelBtn.disabled = true;
alert('Too many documents are selected: ' + getSize(totNumByte) + '. Please remove some selections to be below 250 MB.');
}
} else {
showOpts();
document.dktEntry.viewSelBtn.disabled = false;
}
}

var totNumPage = 0.0;
var totNumByte = 0.0;
var maxSize = 0;
function sumDocSelected(aField, numPage, numByte, docId) {
pageLoadIndexUpdate();
if (aField.checked == true) {
dlsIdArr.push(docId); totNumPage = parseInt(totNumPage) + parseInt(numPage);
totNumByte = parseInt(totNumByte) + parseInt(numByte);
} else {
var dlsIndex = dlsIdArr.indexOf(docId);
dlsIdArr.splice(dlsIndex, 1);
totNumPage = parseInt(totNumPage) - parseInt(numPage);
totNumByte = parseInt(totNumByte) - parseInt(numByte);
}
document.getElementById('totPageCntFld').value = totNumPage;
document.getElementById('totByteSizeFld').value = getSize(totNumByte);
document.dktEntry.dynaTotPageFld.value = totNumPage;
document.dktEntry.dynaTotBytesFld.value = getSize(totNumByte);
tooManyDocs(totNumByte > 262144000);
return false;
}

function pageLoadIndexUpdate() {
var pageLoadIndex = document.getElementById('pageLoadHFld').value;
if (pageLoadIndex == 1) {
totNumByte = document.getElementById('totBytesHFld').value;
totNumPage = document.getElementById('totPageHFld').value;
document.getElementById('pageLoadHFld').value = 2;
}
return false;
}

function showOpts() {
document.getElementById('opts').style.pointerEvents = 'auto';
document.getElementById('opts').style.opacity = '1';
document.getElementById('noOpts').style.visibility = 'hidden';
}

function hideOpts() {
document.getElementById('opts').style.pointerEvents = 'none';
document.getElementById('opts').style.opacity = '0.4';
document.getElementById('noOpts').style.visibility = 'visible';
}

function ProcessForm(uChoice) {
pageLoadIndexUpdate();
if (totNumByte == 0) {
alert('No entries with accessible documents were selected. Please select the desired documents before proceeding.');
return false;
}
if(uChoice=='view'){
var incFoot = '';
if (document.dktEntry.incPdfFooter.checked) {
incFoot = 'y';
}
window.location='TransportRoom?servlet=ShowDocMulti&caseId=46307&outputType=doc&d=6315334&outputForm=view&incPdfFooter='+incFoot+'&dls='+dlsIdArr.join();
}
return false;
}

//-->
</SCRIPT>
</CENTER></BODY></HTML>
21 changes: 21 additions & 0 deletions tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"attachments": [
{
"attachment_number": 1,
"description": "Main Document",
"file_size_bytes": 13481,
"pacer_doc_id": "00107548050",
"page_count": 1
},
{
"attachment_number": 2,
"description": "",
"file_size_bytes": 8890,
"pacer_doc_id": "00107548051",
"page_count": 1
}
],
"pacer_case_id": "46307",
"pacer_doc_id": "00107548050",
"pacer_seq_no": "6315334"
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,21 @@
{
"attachment_number": 1,
"description": "Motion Filed on Behalf of Party",
"file_size_bytes": 10293,
"pacer_doc_id": "00506485029",
"page_count": 4
},
{
"attachment_number": 2,
"description": "Appellant Brief",
"file_size_bytes": 113319,
"pacer_doc_id": "00506485030",
"page_count": 30
},
{
"attachment_number": 3,
"description": "Record Excerpts",
"file_size_bytes": 2155547,
"pacer_doc_id": "00506485031",
"page_count": 43
}
Expand Down
Loading
Loading