Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(mass): Use different endpoint for Mass #1243

Merged
merged 4 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 41 additions & 33 deletions juriscraper/opinions/united_states/state/mass.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,52 +15,60 @@
- 2023-01-28, William Palin: Updated scraper
"""

import re
from lxml import etree, html

from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
"""
Backscraper is implemented on `united_states_backscrapers.state.mass.py`
"""

court_identifier = "SJC"

def __init__(self, *args, **kwargs):
    """Configure the scraper for the Social Law slip-opinions endpoint

    Sets the request URL, the request headers and the court attributes
    that the rest of the scraper reads

    :param args: positional arguments forwarded to the parent constructor
    :param kwargs: keyword arguments forwarded to the parent constructor
    """
    super().__init__(*args, **kwargs)
    # Endpoint consumed by _process_html; assigned exactly once so there is
    # no ambiguity about which source is scraped
    self.url = "https://www.socialaw.com/customapi/slips/getopinions"
    self.court_id = self.__module__
    self.court_identifier = "SJC"
    # Replace the request headers with a desktop Safari User-Agent
    self.request["headers"] = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
    }
    # NOTE(review): presumably tells the framework to send the custom
    # headers set above - confirm against the parent class
    self.needs_special_headers = True
    # Compared against each row's "SectionName" in _process_html
    self.court_name = "Supreme Judicial Court"
    self.status = "Published"

flooie marked this conversation as resolved.
Show resolved Hide resolved
def _process_html(self):
    """Scrape and process the JSON endpoint

    Iterates over the rows in ``self.html`` (presumably the decoded JSON
    list returned by the endpoint - confirm against the parent class),
    keeps the rows whose "SectionName" equals ``self.court_name`` and
    appends one case dictionary per kept row to ``self.cases``

    :return: None
    """
    for row in self.html:
        # The endpoint mixes several courts; keep only this scraper's court
        if row["SectionName"] != self.court_name:
            continue

        details = row["Details"]
        present = details["Present"]
        self.cases.append(
            {
                "name": row.get("Parties"),
                # "Present" is only used as the judge string when it
                # contains "JJ"; any other value yields an empty string
                "judge": present if "JJ" in present else "",
                "date": row["Date"],
                # "headnotes": row['Details']['Keywords'],
                "summary": details["ShortOpinion"],
                "url": f"https://www.socialaw.com/services/slip-opinions/{row['UrlName']}",
                "docket": details["Docket"],
            }
        )

@staticmethod
def cleanup_content(content):
    """Reduce a Social Law page to the opinion markup only

    Decodes the downloaded page, passes it through
    `strip_bad_html_tags_insecure` with script removal enabled, and moves
    the opinion's detail container into a bare html/body skeleton so only
    the opinion itself is left to display. An IndexError is raised when
    the detail container is not present in the page

    :param content: The scraped HTML, as UTF-8 encoded bytes
    :return: The cleaned-up HTML document
    """
    decoded_page = content.decode("utf-8")
    stripped_tree = strip_bad_html_tags_insecure(
        decoded_page, remove_scripts=True
    )
    # The opinion lives in a single, fixed-id container div
    matches = stripped_tree.xpath(
        "//div[@id='contentPlaceholder_ctl00_ctl00_ctl00_detailContainer']"
    )
    opinion_div = matches[0]
    # Re-home the container under a minimal <html><body> wrapper
    document = etree.Element("html")
    etree.SubElement(document, "body").append(opinion_div)
    return html.tostring(document, pretty_print=True, encoding="unicode")
2 changes: 1 addition & 1 deletion juriscraper/opinions/united_states/state/massappct.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ class Site(mass.Site):
# Appeals Court variant: run the parent mass.Site setup, then override the
# court-specific attributes.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
# NOTE(review): this diff view lists both the court_identifier line and the
# court_name line; presumably only court_name survives the change - confirm
# against the merged file.
self.court_identifier = "AC"
# Section name that the parent's _process_html compares with each row's
# "SectionName" value.
self.court_name = "Appeals Court"
440 changes: 417 additions & 23 deletions tests/examples/opinions/united_states/mass_example.compare.json

Large diffs are not rendered by default.

14,383 changes: 0 additions & 14,383 deletions tests/examples/opinions/united_states/mass_example.html

This file was deleted.

1,682 changes: 1,682 additions & 0 deletions tests/examples/opinions/united_states/mass_example.json

Large diffs are not rendered by default.

382 changes: 353 additions & 29 deletions tests/examples/opinions/united_states/massappct_example.compare.json

Large diffs are not rendered by default.

Loading
Loading