From 15def3f8755be46b58526bff25cfbd7f25b31e4b Mon Sep 17 00:00:00 2001 From: ttys0dev <126845556+ttys0dev@users.noreply.github.com> Date: Sat, 16 Nov 2024 22:54:01 +0200 Subject: [PATCH 1/3] Handle new appellate attachment page format --- juriscraper/lib/html_utils.py | 2 +- .../pacer/appellate_attachment_page.py | 55 +++++-- juriscraper/pacer/utils.py | 28 ++++ .../appellate_attachment_pages/ca1_46307.json | 6 +- .../ca1_46307_new.html | 139 ++++++++++++++++++ .../ca1_46307_new.json | 21 +++ .../ca5_22-30311.json | 3 + 7 files changed, 241 insertions(+), 13 deletions(-) create mode 100644 tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html create mode 100644 tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py index 766da1693..b99a68d02 100644 --- a/juriscraper/lib/html_utils.py +++ b/juriscraper/lib/html_utils.py @@ -29,7 +29,7 @@ ALLOWED_ATTRIBUTES["div"] = {"class", "id"} ALLOWED_ATTRIBUTES["font"] = {"face", "size"} ALLOWED_ATTRIBUTES["form"] = {"name", "method", "action"} -ALLOWED_ATTRIBUTES["input"] = {"id", "name", "value", "type", "onclick"} +ALLOWED_ATTRIBUTES["input"] = {"class", "id", "name", "value", "type", "onclick"} ALLOWED_ATTRIBUTES["span"] = {"class"} ALLOWED_ATTRIBUTES["table"].update({"border", "class"}) ALLOWED_ATTRIBUTES["tr"].add("class") diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py index bb25cbbf1..f47782efa 100644 --- a/juriscraper/pacer/appellate_attachment_page.py +++ b/juriscraper/pacer/appellate_attachment_page.py @@ -7,7 +7,8 @@ from juriscraper.lib.html_utils import strip_bad_html_tags_insecure from juriscraper.lib.string_utils import force_unicode -from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url +from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url, \ + reverse_sumDocSelected_function from ..lib.log_tools import make_default_logger from .reports import BaseReport @@ -97,14 +98,16 @@ def data(self) -> Dict: } for row in rows: - result["attachments"].append( - { - "attachment_number": self._get_attachment_number(row), - "description": self._get_description_from_tr(row), - "page_count": self._get_page_count_from_tr(row), - "pacer_doc_id": self._get_pacer_doc_id(row), - } - ) + attachment = { + "attachment_number": self._get_attachment_number(row), + "description": self._get_description_from_tr(row), + "page_count": self._get_page_count_from_tr(row), + "pacer_doc_id": self._get_pacer_doc_id(row), + } + file_size_bytes = self._get_file_size_bytes_from_tr(row) + if file_size_bytes is not None: + attachment["file_size_bytes"] = file_size_bytes + result["attachments"].append(attachment) return result def _get_main_pacer_doc_id(self): @@ -143,7 +146,10 @@ def _get_description_from_tr(self, row: html.HtmlElement) -> str: row_nodes = row.xpath(".//td") if not row_nodes: return "" - description = row_nodes[-2].xpath("text()") + desc_idx = -2 + if len(row_nodes) == 6: + desc_idx = -3 + description = row_nodes[desc_idx].xpath("text()") if description: return force_unicode(description[0].strip()) return "" @@ -156,11 +162,40 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]: :param row: Table row as an lxml element :return: Attachment page count """ + input_els = row.xpath(".//input[@class='selDocCl']") + for input_el in input_els: + try: + onclick = input_el.xpath("./@onclick") + if onclick and "sumDocSelected" in onclick[0]: + sum_doc_selected_parts = reverse_sumDocSelected_function(onclick[0]) + if sum_doc_selected_parts: + return sum_doc_selected_parts["page_count"] + except IndexError: + continue + description_text_nodes = row.xpath(".//td/text()") if not description_text_nodes: return None return int(description_text_nodes[-1].strip()) + @staticmethod + def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]: + """Take a row from the attachment table and return the number of bytes + as an int. + """ + input_els = row.xpath(".//input[@class='selDocCl']") + for input_el in input_els: + try: + onclick = input_el.xpath("./@onclick") + if onclick and "sumDocSelected" in onclick[0]: + sum_doc_selected_parts = reverse_sumDocSelected_function(onclick[0]) + if sum_doc_selected_parts: + return sum_doc_selected_parts["file_size_bytes"] + except IndexError: + continue + + return None + @staticmethod def _get_pacer_doc_id(row: html.HtmlElement) -> Optional[str]: """Take in a row from the attachment table and return the pacer_doc_id diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py index 424caedca..598e1ed2d 100644 --- a/juriscraper/pacer/utils.py +++ b/juriscraper/pacer/utils.py @@ -598,6 +598,34 @@ def reverse_goDLS_function(s): return parts +def reverse_sumDocSelected_function(s): + """Extract the arguments from the sumDocSelected JavaScript function. + + In: sumDocSelected(this,1,13481, 7548050) + Out: { + 'page_count': 1, + 'file_size_bytes': 13481, + 'doc_id': 7548050 + } + + The key names correspond to the form field names in the JavaScript on PACER: + + - page_count: Number of pages in the document. + - file_size_bytes: Size of the file in bytes. + - doc_id: document ID without court prefix, sometimes called dlsid. + """ + match = re.search(r"sumDocSelected\((.*?)\)", s) + args = [arg.strip() for arg in match.group(1).split(',')] + if args[0] != "this": + return None + parts = { + "page_count": int(args[1]), + "file_size_bytes": int(args[2]), + "doc_id": int(args[3]), + } + return parts + + def make_doc1_url(court_id, pacer_doc_id, skip_attachment_page): """Make a doc1 URL. diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json b/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json index 942701062..60b8d1892 100644 --- a/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json +++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json @@ -3,14 +3,16 @@ { "attachment_number": 1, "description": "Main Document", + "file_size_bytes": 13481, "pacer_doc_id": "00107548050", - "page_count": 3 + "page_count": 1 }, { "attachment_number": 2, "description": "", + "file_size_bytes": 8890, "pacer_doc_id": "00107548051", - "page_count": 5 + "page_count": 1 } ], "pacer_case_id": "46307", diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html new file mode 100644 index 000000000..fbc779b90 --- /dev/null +++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html @@ -0,0 +1,139 @@ + +Document + + + +
2 Documents are attached to this filing

+ + +
+ + + + + +
Document DescriptionPagesSize
1Open document Main Document113.17 KB
2Open document 18.68 KB
+
+
+
+Selected Pages: + +  Selected Size: + + + +
Totals reflect accessible documents only and do not include unauthorized restricted documents. +

+
Include Page Numbers
+ + + + + + +
+
+
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json new file mode 100644 index 000000000..60b8d1892 --- /dev/null +++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json @@ -0,0 +1,21 @@ +{ + "attachments": [ + { + "attachment_number": 1, + "description": "Main Document", + "file_size_bytes": 13481, + "pacer_doc_id": "00107548050", + "page_count": 1 + }, + { + "attachment_number": 2, + "description": "", + "file_size_bytes": 8890, + "pacer_doc_id": "00107548051", + "page_count": 1 + } + ], + "pacer_case_id": "46307", + "pacer_doc_id": "00107548050", + "pacer_seq_no": "6315334" +} \ No newline at end of file diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json b/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json index fdee7eaa8..212d342e4 100644 --- a/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json +++ b/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json @@ -3,18 +3,21 @@ { "attachment_number": 1, "description": "Motion Filed on Behalf of Party", + "file_size_bytes": 10293, "pacer_doc_id": "00506485029", "page_count": 4 }, { "attachment_number": 2, "description": "Appellant Brief", + "file_size_bytes": 113319, "pacer_doc_id": "00506485030", "page_count": 30 }, { "attachment_number": 3, "description": "Record Excerpts", + "file_size_bytes": 2155547, "pacer_doc_id": "00506485031", "page_count": 43 } From 2b322a0f82b40b2bdb1d6ab9dead0a6066ec405a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 16 Nov 2024 20:57:14 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- juriscraper/lib/html_utils.py | 9 ++++++++- juriscraper/pacer/appellate_attachment_page.py | 14 ++++++++++---- juriscraper/pacer/utils.py | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py index b99a68d02..d938ada71 100644 --- a/juriscraper/lib/html_utils.py +++ b/juriscraper/lib/html_utils.py @@ -29,7 +29,14 @@ ALLOWED_ATTRIBUTES["div"] = {"class", "id"} ALLOWED_ATTRIBUTES["font"] = {"face", "size"} ALLOWED_ATTRIBUTES["form"] = {"name", "method", "action"} -ALLOWED_ATTRIBUTES["input"] = {"class", "id", "name", "value", "type", "onclick"} +ALLOWED_ATTRIBUTES["input"] = { + "class", + "id", + "name", + "value", + "type", + "onclick", +} ALLOWED_ATTRIBUTES["span"] = {"class"} ALLOWED_ATTRIBUTES["table"].update({"border", "class"}) ALLOWED_ATTRIBUTES["tr"].add("class") diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py index f47782efa..0e46cebd9 100644 --- a/juriscraper/pacer/appellate_attachment_page.py +++ b/juriscraper/pacer/appellate_attachment_page.py @@ -7,8 +7,10 @@ from juriscraper.lib.html_utils import strip_bad_html_tags_insecure from juriscraper.lib.string_utils import force_unicode -from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url, \ - reverse_sumDocSelected_function +from juriscraper.pacer.utils import ( + get_pacer_doc_id_from_doc1_url, + reverse_sumDocSelected_function, +) from ..lib.log_tools import make_default_logger from .reports import BaseReport @@ -167,7 +169,9 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]: try: onclick = input_el.xpath("./@onclick") if onclick and "sumDocSelected" in onclick[0]: - sum_doc_selected_parts = reverse_sumDocSelected_function(onclick[0]) + sum_doc_selected_parts = reverse_sumDocSelected_function( + onclick[0] + ) if sum_doc_selected_parts: return sum_doc_selected_parts["page_count"] except IndexError: @@ -188,7 +192,9 @@ def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]: try: onclick = input_el.xpath("./@onclick") if onclick and "sumDocSelected" in onclick[0]: - sum_doc_selected_parts = reverse_sumDocSelected_function(onclick[0]) + sum_doc_selected_parts = reverse_sumDocSelected_function( + onclick[0] + ) if sum_doc_selected_parts: return sum_doc_selected_parts["file_size_bytes"] except IndexError: diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py index 598e1ed2d..4829c41bb 100644 --- a/juriscraper/pacer/utils.py +++ b/juriscraper/pacer/utils.py @@ -615,7 +615,7 @@ def reverse_sumDocSelected_function(s): - doc_id: document ID without court prefix, sometimes called dlsid. """ match = re.search(r"sumDocSelected\((.*?)\)", s) - args = [arg.strip() for arg in match.group(1).split(',')] + args = [arg.strip() for arg in match.group(1).split(",")] if args[0] != "this": return None parts = { From 00f02b28cab15085147bcd7a1b3641304ca6634e Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 10 Dec 2024 15:39:13 -0600 Subject: [PATCH 3/3] fix(pacer): Introduced parse_sumDocSelected_from_row --- .../pacer/appellate_attachment_page.py | 36 ++--- juriscraper/pacer/utils.py | 21 ++- .../ca5_221848.html | 138 ++++++++++++++++++ .../ca5_221848.json | 21 +++ 4 files changed, 189 insertions(+), 27 deletions(-) create mode 100644 tests/examples/pacer/appellate_attachment_pages/ca5_221848.html create mode 100644 tests/examples/pacer/appellate_attachment_pages/ca5_221848.json diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py index 0e46cebd9..8cae78c5a 100644 --- a/juriscraper/pacer/appellate_attachment_page.py +++ b/juriscraper/pacer/appellate_attachment_page.py @@ -9,7 +9,7 @@ from juriscraper.lib.string_utils import force_unicode from juriscraper.pacer.utils import ( get_pacer_doc_id_from_doc1_url, - reverse_sumDocSelected_function, + parse_sumDocSelected_from_row, ) from ..lib.log_tools import make_default_logger @@ -164,18 +164,9 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]: :param row: Table row as an lxml element :return: Attachment page count """ - input_els = row.xpath(".//input[@class='selDocCl']") - for input_el in input_els: - try: - onclick = input_el.xpath("./@onclick") - if onclick and "sumDocSelected" in onclick[0]: - sum_doc_selected_parts = reverse_sumDocSelected_function( - onclick[0] - ) - if sum_doc_selected_parts: - return sum_doc_selected_parts["page_count"] - except IndexError: - continue + sum_doc_selected_parts = parse_sumDocSelected_from_row(row) + if sum_doc_selected_parts and "page_count" in sum_doc_selected_parts: + return sum_doc_selected_parts["page_count"] description_text_nodes = row.xpath(".//td/text()") if not description_text_nodes: @@ -187,19 +178,12 @@ def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]: """Take a row from the attachment table and return the number of bytes as an int. """ - input_els = row.xpath(".//input[@class='selDocCl']") - for input_el in input_els: - try: - onclick = input_el.xpath("./@onclick") - if onclick and "sumDocSelected" in onclick[0]: - sum_doc_selected_parts = reverse_sumDocSelected_function( - onclick[0] - ) - if sum_doc_selected_parts: - return sum_doc_selected_parts["file_size_bytes"] - except IndexError: - continue - + sum_doc_selected_parts = parse_sumDocSelected_from_row(row) + if ( + sum_doc_selected_parts + and "file_size_bytes" in sum_doc_selected_parts + ): + return sum_doc_selected_parts["file_size_bytes"] return None @staticmethod diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py index 4829c41bb..a4f2c1d03 100644 --- a/juriscraper/pacer/utils.py +++ b/juriscraper/pacer/utils.py @@ -598,7 +598,7 @@ def reverse_goDLS_function(s): return parts -def reverse_sumDocSelected_function(s): +def reverse_sumDocSelected_function(s) -> Optional[Dict[str, int]]: """Extract the arguments from the sumDocSelected JavaScript function. In: sumDocSelected(this,1,13481, 7548050) @@ -834,3 +834,22 @@ def parse_datetime_for_us_timezone(datetime_str: str) -> datetime: # Raise an exception if a timezone abbreviation is not specified. raise NotImplementedError(f"Datetime {datetime_str} not understood.") return date_time + + +def parse_sumDocSelected_from_row( + row: html.HtmlElement, +) -> Optional[Dict[str, int]]: + """Parse the arguments from the sumDocSelected function call parts from a + given table row. + + :param row: Table row as an HtmlElement + :return: A dictionary of parsed parameters from the sumDocSelected function, + or None if the row does not contain such data. + """ + + input_els = row.xpath(".//input[@class='selDocCl']") + for input_el in input_els: + onclick = input_el.xpath("./@onclick") + if onclick and "sumDocSelected" in onclick[0]: + return reverse_sumDocSelected_function(onclick[0]) + return None diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html new file mode 100644 index 000000000..9a53e9c4c --- /dev/null +++ b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html @@ -0,0 +1,138 @@ +Document + + + +
2 Documents are attached to this filing

+ + +
+ + + + + +
Document DescriptionPagesSize
1Open document Published Opinion13129.46 KB
2Open form OPJDT-2 Letter266.85 KB
+
+
+
+Selected Pages: + +  Selected Size: + + + +
Totals reflect accessible documents only and do not include unauthorized restricted documents. +

+
Include Page Numbers
+ + + + + + +
+
+
\ No newline at end of file diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json new file mode 100644 index 000000000..163da0fc5 --- /dev/null +++ b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json @@ -0,0 +1,21 @@ +{ + "attachments": [ + { + "attachment_number": 1, + "description": "Published Opinion", + "file_size_bytes": 132568, + "pacer_doc_id": "00507148063", + "page_count": 13 + }, + { + "attachment_number": 2, + "description": "OPJDT-2 Letter", + "file_size_bytes": 68451, + "pacer_doc_id": "00507148074", + "page_count": 2 + } + ], + "pacer_case_id": "210055", + "pacer_doc_id": "00507148063", + "pacer_seq_no": "10348292" +} \ No newline at end of file