diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py index 766da1693..d938ada71 100644 --- a/juriscraper/lib/html_utils.py +++ b/juriscraper/lib/html_utils.py @@ -29,7 +29,14 @@ ALLOWED_ATTRIBUTES["div"] = {"class", "id"} ALLOWED_ATTRIBUTES["font"] = {"face", "size"} ALLOWED_ATTRIBUTES["form"] = {"name", "method", "action"} -ALLOWED_ATTRIBUTES["input"] = {"id", "name", "value", "type", "onclick"} +ALLOWED_ATTRIBUTES["input"] = { + "class", + "id", + "name", + "value", + "type", + "onclick", +} ALLOWED_ATTRIBUTES["span"] = {"class"} ALLOWED_ATTRIBUTES["table"].update({"border", "class"}) ALLOWED_ATTRIBUTES["tr"].add("class") diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py index bb25cbbf1..8cae78c5a 100644 --- a/juriscraper/pacer/appellate_attachment_page.py +++ b/juriscraper/pacer/appellate_attachment_page.py @@ -7,7 +7,10 @@ from juriscraper.lib.html_utils import strip_bad_html_tags_insecure from juriscraper.lib.string_utils import force_unicode -from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url +from juriscraper.pacer.utils import ( + get_pacer_doc_id_from_doc1_url, + parse_sumDocSelected_from_row, +) from ..lib.log_tools import make_default_logger from .reports import BaseReport @@ -97,14 +100,16 @@ def data(self) -> Dict: } for row in rows: - result["attachments"].append( - { - "attachment_number": self._get_attachment_number(row), - "description": self._get_description_from_tr(row), - "page_count": self._get_page_count_from_tr(row), - "pacer_doc_id": self._get_pacer_doc_id(row), - } - ) + attachment = { + "attachment_number": self._get_attachment_number(row), + "description": self._get_description_from_tr(row), + "page_count": self._get_page_count_from_tr(row), + "pacer_doc_id": self._get_pacer_doc_id(row), + } + file_size_bytes = self._get_file_size_bytes_from_tr(row) + if file_size_bytes is not None: + attachment["file_size_bytes"] = file_size_bytes + result["attachments"].append(attachment) return result def _get_main_pacer_doc_id(self): @@ -143,7 +148,10 @@ def _get_description_from_tr(self, row: html.HtmlElement) -> str: row_nodes = row.xpath(".//td") if not row_nodes: return "" - description = row_nodes[-2].xpath("text()") + desc_idx = -2 + if len(row_nodes) == 6: + desc_idx = -3 + description = row_nodes[desc_idx].xpath("text()") if description: return force_unicode(description[0].strip()) return "" @@ -156,11 +164,28 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]: :param row: Table row as an lxml element :return: Attachment page count """ + sum_doc_selected_parts = parse_sumDocSelected_from_row(row) + if sum_doc_selected_parts and "page_count" in sum_doc_selected_parts: + return sum_doc_selected_parts["page_count"] + description_text_nodes = row.xpath(".//td/text()") if not description_text_nodes: return None return int(description_text_nodes[-1].strip()) + @staticmethod + def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]: + """Take a row from the attachment table and return the number of bytes + as an int. + """ + sum_doc_selected_parts = parse_sumDocSelected_from_row(row) + if ( + sum_doc_selected_parts + and "file_size_bytes" in sum_doc_selected_parts + ): + return sum_doc_selected_parts["file_size_bytes"] + return None + @staticmethod def _get_pacer_doc_id(row: html.HtmlElement) -> Optional[str]: """Take in a row from the attachment table and return the pacer_doc_id diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py index 424caedca..a4f2c1d03 100644 --- a/juriscraper/pacer/utils.py +++ b/juriscraper/pacer/utils.py @@ -598,6 +598,34 @@ def reverse_goDLS_function(s): return parts +def reverse_sumDocSelected_function(s) -> Optional[Dict[str, int]]: + """Extract the arguments from the sumDocSelected JavaScript function. + + In: sumDocSelected(this,1,13481, 7548050) + Out: { + 'page_count': 1, + 'file_size_bytes': 13481, + 'doc_id': 7548050 + } + + The key names correspond to the form field names in the JavaScript on PACER: + + - page_count: Number of pages in the document. + - file_size_bytes: Size of the file in bytes. + - doc_id: document ID without court prefix, sometimes called dlsid. + """ + match = re.search(r"sumDocSelected\((.*?)\)", s) + args = [arg.strip() for arg in match.group(1).split(",")] + if args[0] != "this": + return None + parts = { + "page_count": int(args[1]), + "file_size_bytes": int(args[2]), + "doc_id": int(args[3]), + } + return parts + + def make_doc1_url(court_id, pacer_doc_id, skip_attachment_page): """Make a doc1 URL. @@ -806,3 +834,22 @@ def parse_datetime_for_us_timezone(datetime_str: str) -> datetime: # Raise an exception if a timezone abbreviation is not specified. raise NotImplementedError(f"Datetime {datetime_str} not understood.") return date_time + + +def parse_sumDocSelected_from_row( + row: html.HtmlElement, +) -> Optional[Dict[str, int]]: + """Parse the arguments from the sumDocSelected function call parts from a + given table row. + + :param row: Table row as an HtmlElement + :return: A dictionary of parsed parameters from the sumDocSelected function, + or None if the row does not contain such data. + """ + + input_els = row.xpath(".//input[@class='selDocCl']") + for input_el in input_els: + onclick = input_el.xpath("./@onclick") + if onclick and "sumDocSelected" in onclick[0]: + return reverse_sumDocSelected_function(onclick[0]) + return None diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json b/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json index 942701062..60b8d1892 100644 --- a/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json +++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json @@ -3,14 +3,16 @@ { "attachment_number": 1, "description": "Main Document", + "file_size_bytes": 13481, "pacer_doc_id": "00107548050", - "page_count": 3 + "page_count": 1 }, { "attachment_number": 2, "description": "", + "file_size_bytes": 8890, "pacer_doc_id": "00107548051", - "page_count": 5 + "page_count": 1 } ], "pacer_case_id": "46307", diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html new file mode 100644 index 000000000..fbc779b90 --- /dev/null +++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html @@ -0,0 +1,139 @@ + +Document + + + +
2 Documents are attached to this filing

+ + +
+ + + + + +
Document DescriptionPagesSize
1Open document Main Document113.17 KB
2Open document 18.68 KB
+
+
+
+Selected Pages: + +  Selected Size: + + + +
Totals reflect accessible documents only and do not include unauthorized restricted documents. +

+
Include Page Numbers
+ + + + + + +
+
+
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json new file mode 100644 index 000000000..60b8d1892 --- /dev/null +++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json @@ -0,0 +1,21 @@ +{ + "attachments": [ + { + "attachment_number": 1, + "description": "Main Document", + "file_size_bytes": 13481, + "pacer_doc_id": "00107548050", + "page_count": 1 + }, + { + "attachment_number": 2, + "description": "", + "file_size_bytes": 8890, + "pacer_doc_id": "00107548051", + "page_count": 1 + } + ], + "pacer_case_id": "46307", + "pacer_doc_id": "00107548050", + "pacer_seq_no": "6315334" +} \ No newline at end of file diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json b/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json index fdee7eaa8..212d342e4 100644 --- a/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json +++ b/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json @@ -3,18 +3,21 @@ { "attachment_number": 1, "description": "Motion Filed on Behalf of Party", + "file_size_bytes": 10293, "pacer_doc_id": "00506485029", "page_count": 4 }, { "attachment_number": 2, "description": "Appellant Brief", + "file_size_bytes": 113319, "pacer_doc_id": "00506485030", "page_count": 30 }, { "attachment_number": 3, "description": "Record Excerpts", + "file_size_bytes": 2155547, "pacer_doc_id": "00506485031", "page_count": 43 } diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html new file mode 100644 index 000000000..9a53e9c4c --- /dev/null +++ b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html @@ -0,0 +1,138 @@ +Document + + + +
2 Documents are attached to this filing

+ + +
+ + + + + +
Document DescriptionPagesSize
1Open document Published Opinion13129.46 KB
2Open form OPJDT-2 Letter266.85 KB
+
+
+
+Selected Pages: + +  Selected Size: + + + +
Totals reflect accessible documents only and do not include unauthorized restricted documents. +

+
Include Page Numbers
+ + + + + + +
+
+
\ No newline at end of file diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json new file mode 100644 index 000000000..163da0fc5 --- /dev/null +++ b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json @@ -0,0 +1,21 @@ +{ + "attachments": [ + { + "attachment_number": 1, + "description": "Published Opinion", + "file_size_bytes": 132568, + "pacer_doc_id": "00507148063", + "page_count": 13 + }, + { + "attachment_number": 2, + "description": "OPJDT-2 Letter", + "file_size_bytes": 68451, + "pacer_doc_id": "00507148074", + "page_count": 2 + } + ], + "pacer_case_id": "210055", + "pacer_doc_id": "00507148063", + "pacer_seq_no": "10348292" +} \ No newline at end of file