From 15def3f8755be46b58526bff25cfbd7f25b31e4b Mon Sep 17 00:00:00 2001
From: ttys0dev <126845556+ttys0dev@users.noreply.github.com>
Date: Sat, 16 Nov 2024 22:54:01 +0200
Subject: [PATCH 1/3] Handle new appellate attachment page format
---
juriscraper/lib/html_utils.py | 2 +-
.../pacer/appellate_attachment_page.py | 55 +++++--
juriscraper/pacer/utils.py | 28 ++++
.../appellate_attachment_pages/ca1_46307.json | 6 +-
.../ca1_46307_new.html | 139 ++++++++++++++++++
.../ca1_46307_new.json | 21 +++
.../ca5_22-30311.json | 3 +
7 files changed, 241 insertions(+), 13 deletions(-)
create mode 100644 tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html
create mode 100644 tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json
diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py
index 766da1693..b99a68d02 100644
--- a/juriscraper/lib/html_utils.py
+++ b/juriscraper/lib/html_utils.py
@@ -29,7 +29,7 @@
ALLOWED_ATTRIBUTES["div"] = {"class", "id"}
ALLOWED_ATTRIBUTES["font"] = {"face", "size"}
ALLOWED_ATTRIBUTES["form"] = {"name", "method", "action"}
-ALLOWED_ATTRIBUTES["input"] = {"id", "name", "value", "type", "onclick"}
+ALLOWED_ATTRIBUTES["input"] = {"class", "id", "name", "value", "type", "onclick"}
ALLOWED_ATTRIBUTES["span"] = {"class"}
ALLOWED_ATTRIBUTES["table"].update({"border", "class"})
ALLOWED_ATTRIBUTES["tr"].add("class")
diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py
index bb25cbbf1..f47782efa 100644
--- a/juriscraper/pacer/appellate_attachment_page.py
+++ b/juriscraper/pacer/appellate_attachment_page.py
@@ -7,7 +7,8 @@
from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.string_utils import force_unicode
-from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url
+from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url, \
+ reverse_sumDocSelected_function
from ..lib.log_tools import make_default_logger
from .reports import BaseReport
@@ -97,14 +98,16 @@ def data(self) -> Dict:
}
for row in rows:
- result["attachments"].append(
- {
- "attachment_number": self._get_attachment_number(row),
- "description": self._get_description_from_tr(row),
- "page_count": self._get_page_count_from_tr(row),
- "pacer_doc_id": self._get_pacer_doc_id(row),
- }
- )
+ attachment = {
+ "attachment_number": self._get_attachment_number(row),
+ "description": self._get_description_from_tr(row),
+ "page_count": self._get_page_count_from_tr(row),
+ "pacer_doc_id": self._get_pacer_doc_id(row),
+ }
+ file_size_bytes = self._get_file_size_bytes_from_tr(row)
+ if file_size_bytes is not None:
+ attachment["file_size_bytes"] = file_size_bytes
+ result["attachments"].append(attachment)
return result
def _get_main_pacer_doc_id(self):
@@ -143,7 +146,10 @@ def _get_description_from_tr(self, row: html.HtmlElement) -> str:
row_nodes = row.xpath(".//td")
if not row_nodes:
return ""
- description = row_nodes[-2].xpath("text()")
+ desc_idx = -2
+ if len(row_nodes) == 6:
+ desc_idx = -3
+ description = row_nodes[desc_idx].xpath("text()")
if description:
return force_unicode(description[0].strip())
return ""
@@ -156,11 +162,40 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]:
:param row: Table row as an lxml element
:return: Attachment page count
"""
+ input_els = row.xpath(".//input[@class='selDocCl']")
+ for input_el in input_els:
+ try:
+ onclick = input_el.xpath("./@onclick")
+ if onclick and "sumDocSelected" in onclick[0]:
+ sum_doc_selected_parts = reverse_sumDocSelected_function(onclick[0])
+ if sum_doc_selected_parts:
+ return sum_doc_selected_parts["page_count"]
+ except IndexError:
+ continue
+
description_text_nodes = row.xpath(".//td/text()")
if not description_text_nodes:
return None
return int(description_text_nodes[-1].strip())
+ @staticmethod
+ def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]:
+ """Take a row from the attachment table and return the number of bytes
+ as an int.
+ """
+ input_els = row.xpath(".//input[@class='selDocCl']")
+ for input_el in input_els:
+ try:
+ onclick = input_el.xpath("./@onclick")
+ if onclick and "sumDocSelected" in onclick[0]:
+ sum_doc_selected_parts = reverse_sumDocSelected_function(onclick[0])
+ if sum_doc_selected_parts:
+ return sum_doc_selected_parts["file_size_bytes"]
+ except IndexError:
+ continue
+
+ return None
+
@staticmethod
def _get_pacer_doc_id(row: html.HtmlElement) -> Optional[str]:
"""Take in a row from the attachment table and return the pacer_doc_id
diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py
index 424caedca..598e1ed2d 100644
--- a/juriscraper/pacer/utils.py
+++ b/juriscraper/pacer/utils.py
@@ -598,6 +598,34 @@ def reverse_goDLS_function(s):
return parts
+def reverse_sumDocSelected_function(s):
+ """Extract the arguments from the sumDocSelected JavaScript function.
+
+ In: sumDocSelected(this,1,13481, 7548050)
+ Out: {
+ 'page_count': 1,
+ 'file_size_bytes': 13481,
+ 'doc_id': 7548050
+ }
+
+ The keys name the positional arguments of the JavaScript call on PACER:
+
+ - page_count: Number of pages in the document.
+ - file_size_bytes: Size of the file in bytes.
+ - doc_id: document ID without court prefix, sometimes called dlsid.
+ """
+ match = re.search(r"sumDocSelected\((.*?)\)", s)
+ args = [arg.strip() for arg in match.group(1).split(',')]
+ if args[0] != "this":
+ return None
+ parts = {
+ "page_count": int(args[1]),
+ "file_size_bytes": int(args[2]),
+ "doc_id": int(args[3]),
+ }
+ return parts
+
+
def make_doc1_url(court_id, pacer_doc_id, skip_attachment_page):
"""Make a doc1 URL.
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json b/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json
index 942701062..60b8d1892 100644
--- a/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json
+++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307.json
@@ -3,14 +3,16 @@
{
"attachment_number": 1,
"description": "Main Document",
+ "file_size_bytes": 13481,
"pacer_doc_id": "00107548050",
- "page_count": 3
+ "page_count": 1
},
{
"attachment_number": 2,
"description": "",
+ "file_size_bytes": 8890,
"pacer_doc_id": "00107548051",
- "page_count": 5
+ "page_count": 1
}
],
"pacer_case_id": "46307",
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html
new file mode 100644
index 000000000..fbc779b90
--- /dev/null
+++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.html
@@ -0,0 +1,139 @@
+
+
Document
+
+
+2 Documents are attached to this filing
|
+
+
+
+
+Selected Pages:
+
+ Selected Size:
+
+
+
+
Totals reflect accessible documents only and do not include unauthorized restricted documents.
+
+
+
Selected documents cannot be combined due to size. Please remove some selections to be below 250 MB.
+
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json
new file mode 100644
index 000000000..60b8d1892
--- /dev/null
+++ b/tests/examples/pacer/appellate_attachment_pages/ca1_46307_new.json
@@ -0,0 +1,21 @@
+{
+ "attachments": [
+ {
+ "attachment_number": 1,
+ "description": "Main Document",
+ "file_size_bytes": 13481,
+ "pacer_doc_id": "00107548050",
+ "page_count": 1
+ },
+ {
+ "attachment_number": 2,
+ "description": "",
+ "file_size_bytes": 8890,
+ "pacer_doc_id": "00107548051",
+ "page_count": 1
+ }
+ ],
+ "pacer_case_id": "46307",
+ "pacer_doc_id": "00107548050",
+ "pacer_seq_no": "6315334"
+}
\ No newline at end of file
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json b/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json
index fdee7eaa8..212d342e4 100644
--- a/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json
+++ b/tests/examples/pacer/appellate_attachment_pages/ca5_22-30311.json
@@ -3,18 +3,21 @@
{
"attachment_number": 1,
"description": "Motion Filed on Behalf of Party",
+ "file_size_bytes": 10293,
"pacer_doc_id": "00506485029",
"page_count": 4
},
{
"attachment_number": 2,
"description": "Appellant Brief",
+ "file_size_bytes": 113319,
"pacer_doc_id": "00506485030",
"page_count": 30
},
{
"attachment_number": 3,
"description": "Record Excerpts",
+ "file_size_bytes": 2155547,
"pacer_doc_id": "00506485031",
"page_count": 43
}
From 2b322a0f82b40b2bdb1d6ab9dead0a6066ec405a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
<66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 16 Nov 2024 20:57:14 +0000
Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
---
juriscraper/lib/html_utils.py | 9 ++++++++-
juriscraper/pacer/appellate_attachment_page.py | 14 ++++++++++----
juriscraper/pacer/utils.py | 2 +-
3 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/juriscraper/lib/html_utils.py b/juriscraper/lib/html_utils.py
index b99a68d02..d938ada71 100644
--- a/juriscraper/lib/html_utils.py
+++ b/juriscraper/lib/html_utils.py
@@ -29,7 +29,14 @@
ALLOWED_ATTRIBUTES["div"] = {"class", "id"}
ALLOWED_ATTRIBUTES["font"] = {"face", "size"}
ALLOWED_ATTRIBUTES["form"] = {"name", "method", "action"}
-ALLOWED_ATTRIBUTES["input"] = {"class", "id", "name", "value", "type", "onclick"}
+ALLOWED_ATTRIBUTES["input"] = {
+ "class",
+ "id",
+ "name",
+ "value",
+ "type",
+ "onclick",
+}
ALLOWED_ATTRIBUTES["span"] = {"class"}
ALLOWED_ATTRIBUTES["table"].update({"border", "class"})
ALLOWED_ATTRIBUTES["tr"].add("class")
diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py
index f47782efa..0e46cebd9 100644
--- a/juriscraper/pacer/appellate_attachment_page.py
+++ b/juriscraper/pacer/appellate_attachment_page.py
@@ -7,8 +7,10 @@
from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.string_utils import force_unicode
-from juriscraper.pacer.utils import get_pacer_doc_id_from_doc1_url, \
- reverse_sumDocSelected_function
+from juriscraper.pacer.utils import (
+ get_pacer_doc_id_from_doc1_url,
+ reverse_sumDocSelected_function,
+)
from ..lib.log_tools import make_default_logger
from .reports import BaseReport
@@ -167,7 +169,9 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]:
try:
onclick = input_el.xpath("./@onclick")
if onclick and "sumDocSelected" in onclick[0]:
- sum_doc_selected_parts = reverse_sumDocSelected_function(onclick[0])
+ sum_doc_selected_parts = reverse_sumDocSelected_function(
+ onclick[0]
+ )
if sum_doc_selected_parts:
return sum_doc_selected_parts["page_count"]
except IndexError:
@@ -188,7 +192,9 @@ def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]:
try:
onclick = input_el.xpath("./@onclick")
if onclick and "sumDocSelected" in onclick[0]:
- sum_doc_selected_parts = reverse_sumDocSelected_function(onclick[0])
+ sum_doc_selected_parts = reverse_sumDocSelected_function(
+ onclick[0]
+ )
if sum_doc_selected_parts:
return sum_doc_selected_parts["file_size_bytes"]
except IndexError:
diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py
index 598e1ed2d..4829c41bb 100644
--- a/juriscraper/pacer/utils.py
+++ b/juriscraper/pacer/utils.py
@@ -615,7 +615,7 @@ def reverse_sumDocSelected_function(s):
- doc_id: document ID without court prefix, sometimes called dlsid.
"""
match = re.search(r"sumDocSelected\((.*?)\)", s)
- args = [arg.strip() for arg in match.group(1).split(',')]
+ args = [arg.strip() for arg in match.group(1).split(",")]
if args[0] != "this":
return None
parts = {
From 00f02b28cab15085147bcd7a1b3641304ca6634e Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Tue, 10 Dec 2024 15:39:13 -0600
Subject: [PATCH 3/3] fix(pacer): Introduced parse_sumDocSelected_from_row
---
.../pacer/appellate_attachment_page.py | 36 ++---
juriscraper/pacer/utils.py | 21 ++-
.../ca5_221848.html | 138 ++++++++++++++++++
.../ca5_221848.json | 21 +++
4 files changed, 189 insertions(+), 27 deletions(-)
create mode 100644 tests/examples/pacer/appellate_attachment_pages/ca5_221848.html
create mode 100644 tests/examples/pacer/appellate_attachment_pages/ca5_221848.json
diff --git a/juriscraper/pacer/appellate_attachment_page.py b/juriscraper/pacer/appellate_attachment_page.py
index 0e46cebd9..8cae78c5a 100644
--- a/juriscraper/pacer/appellate_attachment_page.py
+++ b/juriscraper/pacer/appellate_attachment_page.py
@@ -9,7 +9,7 @@
from juriscraper.lib.string_utils import force_unicode
from juriscraper.pacer.utils import (
get_pacer_doc_id_from_doc1_url,
- reverse_sumDocSelected_function,
+ parse_sumDocSelected_from_row,
)
from ..lib.log_tools import make_default_logger
@@ -164,18 +164,9 @@ def _get_page_count_from_tr(row: html.HtmlElement) -> Optional[int]:
:param row: Table row as an lxml element
:return: Attachment page count
"""
- input_els = row.xpath(".//input[@class='selDocCl']")
- for input_el in input_els:
- try:
- onclick = input_el.xpath("./@onclick")
- if onclick and "sumDocSelected" in onclick[0]:
- sum_doc_selected_parts = reverse_sumDocSelected_function(
- onclick[0]
- )
- if sum_doc_selected_parts:
- return sum_doc_selected_parts["page_count"]
- except IndexError:
- continue
+ sum_doc_selected_parts = parse_sumDocSelected_from_row(row)
+ if sum_doc_selected_parts and "page_count" in sum_doc_selected_parts:
+ return sum_doc_selected_parts["page_count"]
description_text_nodes = row.xpath(".//td/text()")
if not description_text_nodes:
@@ -187,19 +178,12 @@ def _get_file_size_bytes_from_tr(row: html.HtmlElement) -> Optional[int]:
"""Take a row from the attachment table and return the number of bytes
as an int.
"""
- input_els = row.xpath(".//input[@class='selDocCl']")
- for input_el in input_els:
- try:
- onclick = input_el.xpath("./@onclick")
- if onclick and "sumDocSelected" in onclick[0]:
- sum_doc_selected_parts = reverse_sumDocSelected_function(
- onclick[0]
- )
- if sum_doc_selected_parts:
- return sum_doc_selected_parts["file_size_bytes"]
- except IndexError:
- continue
-
+ sum_doc_selected_parts = parse_sumDocSelected_from_row(row)
+ if (
+ sum_doc_selected_parts
+ and "file_size_bytes" in sum_doc_selected_parts
+ ):
+ return sum_doc_selected_parts["file_size_bytes"]
return None
@staticmethod
diff --git a/juriscraper/pacer/utils.py b/juriscraper/pacer/utils.py
index 4829c41bb..a4f2c1d03 100644
--- a/juriscraper/pacer/utils.py
+++ b/juriscraper/pacer/utils.py
@@ -598,7 +598,7 @@ def reverse_goDLS_function(s):
return parts
-def reverse_sumDocSelected_function(s):
+def reverse_sumDocSelected_function(s) -> Optional[Dict[str, int]]:
"""Extract the arguments from the sumDocSelected JavaScript function.
In: sumDocSelected(this,1,13481, 7548050)
@@ -834,3 +834,22 @@ def parse_datetime_for_us_timezone(datetime_str: str) -> datetime:
# Raise an exception if a timezone abbreviation is not specified.
raise NotImplementedError(f"Datetime {datetime_str} not understood.")
return date_time
+
+
+def parse_sumDocSelected_from_row(
+ row: html.HtmlElement,
+) -> Optional[Dict[str, int]]:
+ """Parse the arguments of the sumDocSelected function call found in a
+ given table row.
+
+ :param row: Table row as an HtmlElement
+ :return: A dictionary of parsed parameters from the sumDocSelected function,
+ or None if the row does not contain such data.
+ """
+
+ input_els = row.xpath(".//input[@class='selDocCl']")
+ for input_el in input_els:
+ onclick = input_el.xpath("./@onclick")
+ if onclick and "sumDocSelected" in onclick[0]:
+ return reverse_sumDocSelected_function(onclick[0])
+ return None
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html
new file mode 100644
index 000000000..9a53e9c4c
--- /dev/null
+++ b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.html
@@ -0,0 +1,138 @@
+Document
+
+
+2 Documents are attached to this filing
|
+
+
+
+
+Selected Pages:
+
+ Selected Size:
+
+
+
+
Totals reflect accessible documents only and do not include unauthorized restricted documents.
+
+
+
Selected documents cannot be combined due to size. Please remove some selections to be below 20 MB.
+
\ No newline at end of file
diff --git a/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json
new file mode 100644
index 000000000..163da0fc5
--- /dev/null
+++ b/tests/examples/pacer/appellate_attachment_pages/ca5_221848.json
@@ -0,0 +1,21 @@
+{
+ "attachments": [
+ {
+ "attachment_number": 1,
+ "description": "Published Opinion",
+ "file_size_bytes": 132568,
+ "pacer_doc_id": "00507148063",
+ "page_count": 13
+ },
+ {
+ "attachment_number": 2,
+ "description": "OPJDT-2 Letter",
+ "file_size_bytes": 68451,
+ "pacer_doc_id": "00507148074",
+ "page_count": 2
+ }
+ ],
+ "pacer_case_id": "210055",
+ "pacer_doc_id": "00507148063",
+ "pacer_seq_no": "10348292"
+}
\ No newline at end of file