Skip to content

Commit

Permalink
Merge pull request #1243 from freelawproject/mass-social-scraper
Browse files Browse the repository at this point in the history
feat(mass): Use different endpoint for Mass
  • Loading branch information
grossir authored Nov 19, 2024
2 parents c9a2b29 + c18f917 commit 2e51b45
Show file tree
Hide file tree
Showing 13 changed files with 4,125 additions and 19,461 deletions.
1 change: 1 addition & 0 deletions juriscraper/lib/string_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,7 @@ def bad_words(self):
"smith",
"johnson",
"commissioner",
"commonwealth",
]

ags = [
Expand Down
82 changes: 51 additions & 31 deletions juriscraper/opinions/united_states/state/mass.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,51 +16,71 @@
"""

import re
from urllib.parse import urljoin

from lxml import etree, html

from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    """
    Backscraper is implemented on `united_states_backscrapers.state.mass.py`
    """

    # Court tag kept for the backscraper; subclasses (e.g. massappct)
    # override the attributes they need.
    court_identifier = "SJC"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # JSON endpoint behind the Social Law "slip opinions" page
        self.url = "https://www.socialaw.com/customapi/slips/getopinions"
        self.court_id = self.__module__
        # NOTE(review): presumably the endpoint rejects the default
        # python-requests User-Agent, hence the browser UA — confirm
        self.request["headers"] = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
        }
        self.needs_special_headers = True
        self.court_name = "Supreme Judicial Court"
        self.status = "Published"
        self.expected_content_types = ["text/html"]

    def _process_html(self):
        """Scrape and process the JSON endpoint

        Iterates the parsed JSON rows, keeps only rows for this court
        (matched by `SectionName`), and appends one case dict per row.

        :return: None
        """
        for row in self.html:
            # Rows for other courts are skipped; subclasses pick their
            # own rows by overriding `court_name`
            if row["SectionName"] != self.court_name:
                continue

            url = urljoin(
                "https://www.socialaw.com/services/slip-opinions/",
                row["UrlName"],
            )
            details = row["Details"]
            caption = titlecase(row.get("Parties"))
            # Strip footnote markers like "[1]" from the case caption
            caption = re.sub(r"(\[\d{1,2}\])", "", caption)

            judge_str = details.get("Present", "")
            judge_str = re.sub(r"(\[\d{1,2}\])", "", judge_str)
            # Drop the ", JJ." suffix that follows the list of justices
            judge_str = re.sub(r"\, JJ\.", "", judge_str)

            self.cases.append(
                {
                    "name": caption,
                    "judge": judge_str,
                    "date": row["Date"],
                    "url": url,
                    "docket": details["Docket"],
                }
            )

    @staticmethod
    def cleanup_content(content):
        """Remove non-opinion HTML

        Cleanup HTML from the Social Law page so we can properly display
        the content: keep only the opinion detail container and wrap it
        in a minimal html/body document.

        :param content: The scraped HTML, as bytes
        :return: Cleaner HTML, as a unicode string
        """
        content = content.decode("utf-8")
        tree = strip_bad_html_tags_insecure(content, remove_scripts=True)
        # The opinion text lives in this ASP.NET placeholder div
        content = tree.xpath(
            "//div[@id='contentPlaceholder_ctl00_ctl00_ctl00_detailContainer']"
        )[0]
        new_tree = etree.Element("html")
        body = etree.SubElement(new_tree, "body")
        body.append(content)
        return html.tostring(new_tree, pretty_print=True, encoding="unicode")
2 changes: 1 addition & 1 deletion juriscraper/opinions/united_states/state/massappct.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ class Site(mass.Site):
def __init__(self, *args, **kwargs):
    """Set the Appeals Court identifiers on top of the base mass.Site setup."""
    super().__init__(*args, **kwargs)
    self.court_id = self.__module__
    # Selects Appeals Court rows in the parent's `_process_html`
    # (which filters the JSON rows by `SectionName` == `court_name`)
    self.court_name = "Appeals Court"
Loading

0 comments on commit 2e51b45

Please sign in to comment.