Skip to content

Commit

Permalink
Merge pull request #1240 from ttys0dev/appellate-attachment-new
Browse files Browse the repository at this point in the history
Handle new appellate attachment page format
  • Loading branch information
albertisfu authored Dec 10, 2024
2 parents 1b1727e + 00f02b2 commit b2197aa
Show file tree
Hide file tree
Showing 9 changed files with 416 additions and 13 deletions.
9 changes: 8 additions & 1 deletion juriscraper/lib/html_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,14 @@
ALLOWED_ATTRIBUTES["div"] = {"class", "id"}
ALLOWED_ATTRIBUTES["font"] = {"face", "size"}
ALLOWED_ATTRIBUTES["form"] = {"name", "method", "action"}
ALLOWED_ATTRIBUTES["input"] = {"id", "name", "value", "type", "onclick"}
ALLOWED_ATTRIBUTES["input"] = {
"class",
"id",
"name",
"value",
"type",
"onclick",
}
ALLOWED_ATTRIBUTES["span"] = {"class"}
ALLOWED_ATTRIBUTES["table"].update({"border", "class"})
ALLOWED_ATTRIBUTES["tr"].add("class")
Expand Down
45 changes: 35 additions & 10 deletions juriscraper/pacer/appellate_attachment_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.string_utils import force_unicode
from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url
from juriscraper.pacer.utils import (
get_pacer_doc_id_from_doc1_url,
parse_sumDocSelected_from_row,
)

from ..lib.log_tools import make_default_logger
from .reports import BaseReport
Expand Down Expand Up @@ -97,14 +100,16 @@ def data(self) -> Dict:
}

for row in rows:
result["attachments"].append(
{
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
}
)
attachment = {
"attachment_number": self._get_attachment_number(row),
"description": self._get_description_from_tr(row),
"page_count": self._get_page_count_from_tr(row),
"pacer_doc_id": self._get_pacer_doc_id(row),
}
file_size_bytes = self._get_file_size_bytes_from_tr(row)
if file_size_bytes is not None:
attachment["file_size_bytes"] = file_size_bytes
result["attachments"].append(attachment)
return result

def _get_main_pacer_doc_id(self):
Expand Down Expand Up @@ -143,7 +148,10 @@ def _get_description_from_tr(self, row: html.HtmlElement) -> str:
row_nodes = row.xpath(".//td")
if not row_nodes:
return ""
description = row_nodes[-2].xpath("text()")
desc_idx = -2
if len(row_nodes) == 6:
desc_idx = -3
description = row_nodes[desc_idx].xpath("text()")
if description:
return force_unicode(description[0].strip())
return ""
Expand All @@ -156,11 +164,28 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]:
:param row: Table row as an lxml element
:return: Attachment page count
"""
sum_doc_selected_parts = parse_sumDocSelected_from_row(row)
if sum_doc_selected_parts and "page_count" in sum_doc_selected_parts:
return sum_doc_selected_parts["page_count"]

description_text_nodes = row.xpath(".//td/text()")
if not description_text_nodes:
return None
return int(description_text_nodes[-1].strip())

@staticmethod
def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]:
    """Return the attachment's file size in bytes for a table row.

    The size is taken from the arguments of the row's ``sumDocSelected``
    call, as parsed by ``parse_sumDocSelected_from_row``.

    :param row: Table row as an lxml element
    :return: The file size in bytes, or None when the row yields no parsed
    ``sumDocSelected`` data or that data has no ``file_size_bytes`` entry.
    """
    parsed_args = parse_sumDocSelected_from_row(row)
    if not parsed_args:
        # Nothing was parsed for this row, so no size is available.
        return None
    return parsed_args.get("file_size_bytes")

@staticmethod
def _get_pacer_doc_id(row: html.HtmlElement) -> Optional[str]:
"""Take in a row from the attachment table and return the pacer_doc_id
Expand Down
47 changes: 47 additions & 0 deletions juriscraper/pacer/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,34 @@ def reverse_goDLS_function(s):
return parts


def reverse_sumDocSelected_function(s: str) -> Optional[Dict[str, int]]:
    """Extract the arguments from the sumDocSelected JavaScript function.

    In: sumDocSelected(this,1,13481, 7548050)
    Out: {
      'page_count': 1,
      'file_size_bytes': 13481,
      'doc_id': 7548050
    }

    The key names correspond to the form field names in the JavaScript on PACER:
     - page_count: Number of pages in the document.
     - file_size_bytes: Size of the file in bytes.
     - doc_id: document ID without court prefix, sometimes called dlsid.

    :param s: The text to search, typically the value of an ``onclick``
    attribute.
    :return: A dict with the three integer values, or None if ``s`` does not
    contain a ``sumDocSelected(this, <int>, <int>, <int>)`` call that can be
    parsed.
    """
    match = re.search(r"sumDocSelected\((.*?)\)", s)
    if match is None:
        # No sumDocSelected(...) call present at all.
        return None

    args = [arg.strip() for arg in match.group(1).split(",")]
    # Expect `this` followed by at least three numeric arguments; anything
    # else is a call shape we do not understand.
    if len(args) < 4 or args[0] != "this":
        return None

    try:
        page_count, file_size_bytes, doc_id = (int(arg) for arg in args[1:4])
    except ValueError:
        # Non-numeric arguments: treat as unparseable rather than raising, so
        # callers can fall back to other ways of reading the row.
        return None

    return {
        "page_count": page_count,
        "file_size_bytes": file_size_bytes,
        "doc_id": doc_id,
    }


def make_doc1_url(court_id, pacer_doc_id, skip_attachment_page):
"""Make a doc1 URL.
Expand Down Expand Up @@ -806,3 +834,22 @@ def parse_datetime_for_us_timezone(datetime_str: str) -> datetime:
# Raise an exception if a timezone abbreviation is not specified.
raise NotImplementedError(f"Datetime {datetime_str} not understood.")
return date_time


def parse_sumDocSelected_from_row(
    row: html.HtmlElement,
) -> Optional[Dict[str, int]]:
    """Parse the sumDocSelected call arguments found in a table row.

    Looks at each ``input`` element with class ``selDocCl`` inside the row and
    parses the first ``onclick`` handler that mentions ``sumDocSelected``.

    :param row: Table row as an HtmlElement
    :return: A dictionary of parsed parameters from the sumDocSelected function,
    or None if the row does not contain such data.
    """
    for checkbox in row.xpath(".//input[@class='selDocCl']"):
        handlers = checkbox.xpath("./@onclick")
        if not handlers:
            # Input without an onclick attribute; try the next one.
            continue
        handler = handlers[0]
        if "sumDocSelected" in handler:
            return reverse_sumDocSelected_function(handler)
    return None
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,16 @@
{
"attachment_number": 1,
"description": "Main Document",
"file_size_bytes": 13481,
"pacer_doc_id": "00107548050",
"page_count": 3
"page_count": 1
},
{
"attachment_number": 2,
"description": "",
"file_size_bytes": 8890,
"pacer_doc_id": "00107548051",
"page_count": 5
"page_count": 1
}
],
"pacer_case_id": "46307",
Expand Down
139 changes: 139 additions & 0 deletions tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
<!DOCTYPE html>
<HTML><HEAD><TITLE>Document</TITLE><meta http-equiv="X-UA-Compatible" content="IE=edge" /></HEAD>
<BODY BGCOLOR=#FFFFDA>
<table border=0 align=center>
<tr><th align=center><b>2 Documents are attached to this filing</b><br><br></th></tr>
</table>
<table border=1 align=center cellpadding=5>
<tr><td>
<table border=0 cellpadding=5>
<tr><th align=center colspan=3>Document</th> <th align=center>Description</th><th align=center>Pages</th>
<th align=center>Size</th></tr>
<TR><TD ALIGN=center><INPUT class='selDocCl' TYPE='checkbox' NAME='md' VALUE='7548050' ONCLICK='sumDocSelected(this,1,13481, 7548050)' CHECKED></TD><td align=center>1</td><td align=center><a HREF='https://ecf.ca1.uscourts.gov/docs1/00117548050' ONCLICK="return doDocPostURL('00117548050')"><img title='Open Document' width='13' height='15' BORDER=2 SRC='TransportRoom?servlet=document.gif' ALT='Open document' ></a>&nbsp;</td><td>Main Document</td><td align=center>1</td><td align=center>13.17 KB</td></tr>
<TR><TD ALIGN=center><INPUT class='selDocCl' TYPE='checkbox' NAME='md' VALUE='7548051' ONCLICK='sumDocSelected(this,1,8890, 7548051)' CHECKED></TD><td align=center>2</td><td align=center><a HREF='https://ecf.ca1.uscourts.gov/docs1/00117548051' ONCLICK="return doDocPostURL('00117548051')"><img title='Open Document' width='13' height='15' BORDER=2 SRC='TransportRoom?servlet=document.gif' ALT='Open document' ></a>&nbsp;</td><td></td><td align=center>1</td><td align=center>8.68 KB</td></tr>
</table>
</td></tr>
</table>
<BR>
<CENTER>
<b>Selected Pages:
<input id='totPageCntFld' type='text' size='10' value='2' onfocus='this.blur();' />
&nbsp;&nbsp;Selected Size:
<input id='totByteSizeFld' type='text' size='10' value='21.85 KB' onfocus='this.blur();' />
</b>

<BR><b>Totals reflect accessible documents only and do not include unauthorized restricted documents.</b>
<BR><BR>
<div id='opts'><FORM name='dktEntry' action=''><INPUT Value='y' TYPE='checkbox' name='incPdfFooter'><B>Include Page Numbers</B><BR>
<INPUT NAME='viewSelBtn' Value='View Selected' TYPE='button' ONCLICK='ProcessForm("view")'>
<input type=hidden id='totPageHFld' name='totPageFld' value='2'>
<input type=hidden id='totBytesHFld' value='22371'>
<input type=hidden id='dynaTotPageId' name='dynaTotPageFld' value='2'>
<input type=hidden id='dynaTotBytesId' name='dynaTotBytesFld' value='22371'>
<input type=hidden id='pageLoadHFld' value='1'>
</FORM>
</div><div id='noOpts' style='visibility:hidden'><b>Selected documents cannot be combined due to size. Please remove some selections to be below 250 MB.</b></div><SCRIPT type='text/javascript'>
<!--
var winOptions = 'location=no,resizable,toolbar,status,scrollbars';
var winTarget = '_blank';
function doDocPostURL(dls) {
// This use of a function when a user clicks a document
// link was done so params aren't in copied doc hyperlinks.
// This allows user to right click & get the URL for copying doc
// links, but still gets params back to the server as needed
var aWin = window.open('TransportRoom?servlet=ShowDoc&caseId=46307&dls_id='+dls+'&caseId=46307',winTarget,winOptions,false);
return false;
}

window.onload = function() {
}

window.onpageshow = function(event) {
if(performance.navigation.type == 2){
location.reload(true);
}
}

var dlsIdArr = [7548050,7548051];
function getSize(bytes) {
if (bytes >= 1048576) {
return Math.round(bytes/1048576*100)/100 + ' MB';
}
return Math.round(bytes/1024*100)/100 + ' KB';
}
function tooManyDocs(disable) {
if (disable) {
hideOpts();
if (document.dktEntry.viewSelBtn.disabled == false) {
document.dktEntry.viewSelBtn.disabled = true;
alert('Too many documents are selected: ' + getSize(totNumByte) + '. Please remove some selections to be below 250 MB.');
}
} else {
showOpts();
document.dktEntry.viewSelBtn.disabled = false;
}
}

var totNumPage = 0.0;
var totNumByte = 0.0;
var maxSize = 0;
function sumDocSelected(aField, numPage, numByte, docId) {
pageLoadIndexUpdate();
if (aField.checked == true) {
dlsIdArr.push(docId); totNumPage = parseInt(totNumPage) + parseInt(numPage);
totNumByte = parseInt(totNumByte) + parseInt(numByte);
} else {
var dlsIndex = dlsIdArr.indexOf(docId);
dlsIdArr.splice(dlsIndex, 1);
totNumPage = parseInt(totNumPage) - parseInt(numPage);
totNumByte = parseInt(totNumByte) - parseInt(numByte);
}
document.getElementById('totPageCntFld').value = totNumPage;
document.getElementById('totByteSizeFld').value = getSize(totNumByte);
document.dktEntry.dynaTotPageFld.value = totNumPage;
document.dktEntry.dynaTotBytesFld.value = getSize(totNumByte);
tooManyDocs(totNumByte > 262144000);
return false;
}

function pageLoadIndexUpdate() {
var pageLoadIndex = document.getElementById('pageLoadHFld').value;
if (pageLoadIndex == 1) {
totNumByte = document.getElementById('totBytesHFld').value;
totNumPage = document.getElementById('totPageHFld').value;
document.getElementById('pageLoadHFld').value = 2;
}
return false;
}

function showOpts() {
document.getElementById('opts').style.pointerEvents = 'auto';
document.getElementById('opts').style.opacity = '1';
document.getElementById('noOpts').style.visibility = 'hidden';
}

function hideOpts() {
document.getElementById('opts').style.pointerEvents = 'none';
document.getElementById('opts').style.opacity = '0.4';
document.getElementById('noOpts').style.visibility = 'visible';
}

function ProcessForm(uChoice) {
pageLoadIndexUpdate();
if (totNumByte == 0) {
alert('No entries with accessible documents were selected. Please select the desired documents before proceeding.');
return false;
}
if(uChoice=='view'){
var incFoot = '';
if (document.dktEntry.incPdfFooter.checked) {
incFoot = 'y';
}
window.location='TransportRoom?servlet=ShowDocMulti&caseId=46307&outputType=doc&d=6315334&outputForm=view&incPdfFooter='+incFoot+'&dls='+dlsIdArr.join();
}
return false;
}

//-->
</SCRIPT>
</CENTER></BODY></HTML>
21 changes: 21 additions & 0 deletions tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"attachments": [
{
"attachment_number": 1,
"description": "Main Document",
"file_size_bytes": 13481,
"pacer_doc_id": "00107548050",
"page_count": 1
},
{
"attachment_number": 2,
"description": "",
"file_size_bytes": 8890,
"pacer_doc_id": "00107548051",
"page_count": 1
}
],
"pacer_case_id": "46307",
"pacer_doc_id": "00107548050",
"pacer_seq_no": "6315334"
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,21 @@
{
"attachment_number": 1,
"description": "Motion Filed on Behalf of Party",
"file_size_bytes": 10293,
"pacer_doc_id": "00506485029",
"page_count": 4
},
{
"attachment_number": 2,
"description": "Appellant Brief",
"file_size_bytes": 113319,
"pacer_doc_id": "00506485030",
"page_count": 30
},
{
"attachment_number": 3,
"description": "Record Excerpts",
"file_size_bytes": 2155547,
"pacer_doc_id": "00506485031",
"page_count": 43
}
Expand Down
Loading

0 comments on commit b2197aa

Please sign in to comment.