diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py
index 766da1693..d938ada71 100644
--- a/juriscraper/lib/html_utils.py
+++ b/juriscraper/lib/html_utils.py
@@ -29,7 +29,14 @@
ALLOWED_ATTRIBUTES["div"] = {"class", "id"}
ALLOWED_ATTRIBUTES["font"] = {"face", "size"}
ALLOWED_ATTRIBUTES["form"] = {"name", "method", "action"}
-ALLOWED_ATTRIBUTES["input"] = {"id", "name", "value", "type", "onclick"}
+# Attribute names allowed on <input> tags.
+# NOTE(review): "class" looks to be allowed so that inputs can be located via
+# .//input[@class='selDocCl'] in parse_sumDocSelected_from_row, which then
+# reads their "onclick" value -- confirm against the sanitizer's callers.
+ALLOWED_ATTRIBUTES["input"] = {
+ "class",
+ "id",
+ "name",
+ "value",
+ "type",
+ "onclick",
+}
ALLOWED_ATTRIBUTES["span"] = {"class"}
ALLOWED_ATTRIBUTES["table"].update({"border", "class"})
ALLOWED_ATTRIBUTES["tr"].add("class")
diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py
index bb25cbbf1..8cae78c5a 100644
--- a/juriscraper/pacer/appellate_attachment_page.py
+++ b/juriscraper/pacer/appellate_attachment_page.py
@@ -7,7 +7,10 @@
from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.string_utils import force_unicode
-from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url
+from juriscraper.pacer.utils import (
+ get_pacer_doc_id_from_doc1_url,
+ parse_sumDocSelected_from_row,
+)
from ..lib.log_tools import make_default_logger
from .reports import BaseReport
@@ -97,14 +100,16 @@ def data(self) -> Dict:
}
for row in rows:
- result["attachments"].append(
- {
- "attachment_number": self._get_attachment_number(row),
- "description": self._get_description_from_tr(row),
- "page_count": self._get_page_count_from_tr(row),
- "pacer_doc_id": self._get_pacer_doc_id(row),
- }
- )
+ attachment = {
+ "attachment_number": self._get_attachment_number(row),
+ "description": self._get_description_from_tr(row),
+ "page_count": self._get_page_count_from_tr(row),
+ "pacer_doc_id": self._get_pacer_doc_id(row),
+ }
+ file_size_bytes = self._get_file_size_bytes_from_tr(row)
+ if file_size_bytes is not None:
+ attachment["file_size_bytes"] = file_size_bytes
+ result["attachments"].append(attachment)
return result
def _get_main_pacer_doc_id(self):
@@ -143,7 +148,10 @@ def _get_description_from_tr(self, row: html.HtmlElement) -> str:
row_nodes = row.xpath(".//td")
if not row_nodes:
return ""
- description = row_nodes[-2].xpath("text()")
+ desc_idx = -2
+ if len(row_nodes) == 6:
+ desc_idx = -3
+ description = row_nodes[desc_idx].xpath("text()")
if description:
return force_unicode(description[0].strip())
return ""
@@ -156,11 +164,28 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]:
:param row: Table row as an lxml element
:return: Attachment page count
"""
+ sum_doc_selected_parts = parse_sumDocSelected_from_row(row)
+ if sum_doc_selected_parts and "page_count" in sum_doc_selected_parts:
+ return sum_doc_selected_parts["page_count"]
+
description_text_nodes = row.xpath(".//td/text()")
if not description_text_nodes:
return None
return int(description_text_nodes[-1].strip())
+ @staticmethod
+ def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]:
+     """Return the file size in bytes recorded for an attachment table row.
+
+     The value is taken from the "file_size_bytes" entry of the dict
+     produced by parse_sumDocSelected_from_row for this row.
+
+     :param row: Table row as an lxml element
+     :return: File size in bytes, or None when no sumDocSelected data was
+     parsed for the row or it has no "file_size_bytes" entry
+     """
+     parsed = parse_sumDocSelected_from_row(row)
+     # Guard clause: nothing parsed (None / empty) or the key is absent.
+     if not parsed or "file_size_bytes" not in parsed:
+         return None
+     return parsed["file_size_bytes"]
+
@staticmethod
def _get_pacer_doc_id(row: html.HtmlElement) -> Optional[str]:
"""Take in a row from the attachment table and return the pacer_doc_id
diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py
index 424caedca..a4f2c1d03 100644
--- a/juriscraper/pacer/utils.py
+++ b/juriscraper/pacer/utils.py
@@ -598,6 +598,34 @@ def reverse_goDLS_function(s):
return parts
+def reverse_sumDocSelected_function(s) -> Optional[Dict[str, int]]:
+    """Extract the arguments from the sumDocSelected JavaScript function.
+
+    In: sumDocSelected(this,1,13481, 7548050)
+    Out: {
+      'page_count': 1,
+      'file_size_bytes': 13481,
+      'doc_id': 7548050
+    }
+
+    The key names correspond to the form field names in the JavaScript on PACER:
+
+     - page_count: Number of pages in the document.
+     - file_size_bytes: Size of the file in bytes.
+     - doc_id: document ID without court prefix, sometimes called dlsid.
+
+    :param s: Text containing a sumDocSelected(...) call, e.g. the value of
+    an onclick attribute.
+    :return: A dict with the three integer arguments, or None when s holds no
+    sumDocSelected(...) call, the first argument is not `this`, fewer than
+    four arguments are present, or the numeric arguments are not integers.
+    """
+    match = re.search(r"sumDocSelected\((.*?)\)", s)
+    if match is None:
+        # re.search returns None when there is no call to parse; bail out
+        # instead of failing on match.group().
+        return None
+    args = [arg.strip() for arg in match.group(1).split(",")]
+    if len(args) < 4 or args[0] != "this":
+        return None
+    try:
+        # Only the three arguments after `this` are used; extras are ignored.
+        page_count, file_size_bytes, doc_id = (int(arg) for arg in args[1:4])
+    except ValueError:
+        # Non-numeric argument: treat the call as unparseable.
+        return None
+    return {
+        "page_count": page_count,
+        "file_size_bytes": file_size_bytes,
+        "doc_id": doc_id,
+    }
+
+
def make_doc1_url(court_id, pacer_doc_id, skip_attachment_page):
"""Make a doc1 URL.
@@ -806,3 +834,22 @@ def parse_datetime_for_us_timezone(datetime_str: str) -> datetime:
# Raise an exception if a timezone abbreviation is not specified.
raise NotImplementedError(f"Datetime {datetime_str} not understood.")
return date_time
+
+
+def parse_sumDocSelected_from_row(
+    row: html.HtmlElement,
+) -> Optional[Dict[str, int]]:
+    """Parse the sumDocSelected call arguments found in a table row.
+
+    Looks at the row's input elements with class "selDocCl" and, for the
+    first one whose onclick attribute mentions sumDocSelected, returns
+    whatever reverse_sumDocSelected_function makes of that attribute.
+
+    :param row: Table row as an HtmlElement
+    :return: A dictionary of parsed parameters from the sumDocSelected function,
+    or None if the row does not contain such data.
+    """
+    for candidate in row.xpath(".//input[@class='selDocCl']"):
+        handlers = candidate.xpath("./@onclick")
+        if not handlers:
+            # Input without an onclick attribute; try the next one.
+            continue
+        handler = handlers[0]
+        if "sumDocSelected" in handler:
+            return reverse_sumDocSelected_function(handler)
+    return None
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json b/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json
index 942701062..60b8d1892 100644
--- a/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json
+++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json
@@ -3,14 +3,16 @@
{
"attachment_number": 1,
"description": "Main Document",
+ "file_size_bytes": 13481,
"pacer_doc_id": "00107548050",
- "page_count": 3
+ "page_count": 1
},
{
"attachment_number": 2,
"description": "",
+ "file_size_bytes": 8890,
"pacer_doc_id": "00107548051",
- "page_count": 5
+ "page_count": 1
}
],
"pacer_case_id": "46307",
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html
new file mode 100644
index 000000000..fbc779b90
--- /dev/null
+++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html
@@ -0,0 +1,139 @@
+
+
Document
+
+
+2 Documents are attached to this filing
|
+
+
+
+
+Selected Pages:
+
+ Selected Size:
+
+
+
+
Totals reflect accessible documents only and do not include unauthorized restricted documents.
+
+
+
Selected documents cannot be combined due to size. Please remove some selections to be below 250 MB.
+
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json
new file mode 100644
index 000000000..60b8d1892
--- /dev/null
+++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json
@@ -0,0 +1,21 @@
+{
+ "attachments": [
+ {
+ "attachment_number": 1,
+ "description": "Main Document",
+ "file_size_bytes": 13481,
+ "pacer_doc_id": "00107548050",
+ "page_count": 1
+ },
+ {
+ "attachment_number": 2,
+ "description": "",
+ "file_size_bytes": 8890,
+ "pacer_doc_id": "00107548051",
+ "page_count": 1
+ }
+ ],
+ "pacer_case_id": "46307",
+ "pacer_doc_id": "00107548050",
+ "pacer_seq_no": "6315334"
+}
\ No newline at end of file
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json b/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json
index fdee7eaa8..212d342e4 100644
--- a/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json
+++ b/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json
@@ -3,18 +3,21 @@
{
"attachment_number": 1,
"description": "Motion Filed on Behalf of Party",
+ "file_size_bytes": 10293,
"pacer_doc_id": "00506485029",
"page_count": 4
},
{
"attachment_number": 2,
"description": "Appellant Brief",
+ "file_size_bytes": 113319,
"pacer_doc_id": "00506485030",
"page_count": 30
},
{
"attachment_number": 3,
"description": "Record Excerpts",
+ "file_size_bytes": 2155547,
"pacer_doc_id": "00506485031",
"page_count": 43
}
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html
new file mode 100644
index 000000000..9a53e9c4c
--- /dev/null
+++ b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html
@@ -0,0 +1,138 @@
+Document
+
+
+2 Documents are attached to this filing
|
+
+
+
+
+Selected Pages:
+
+ Selected Size:
+
+
+
+
Totals reflect accessible documents only and do not include unauthorized restricted documents.
+
+
+
Selected documents cannot be combined due to size. Please remove some selections to be below 20 MB.
+
\ No newline at end of file
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json
new file mode 100644
index 000000000..163da0fc5
--- /dev/null
+++ b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json
@@ -0,0 +1,21 @@
+{
+ "attachments": [
+ {
+ "attachment_number": 1,
+ "description": "Published Opinion",
+ "file_size_bytes": 132568,
+ "pacer_doc_id": "00507148063",
+ "page_count": 13
+ },
+ {
+ "attachment_number": 2,
+ "description": "OPJDT-2 Letter",
+ "file_size_bytes": 68451,
+ "pacer_doc_id": "00507148074",
+ "page_count": 2
+ }
+ ],
+ "pacer_case_id": "210055",
+ "pacer_doc_id": "00507148063",
+ "pacer_seq_no": "10348292"
+}
\ No newline at end of file