Skip to content

Commit

Permalink
Merge pull request #1243 from freelawproject/mass-social-scraper
Browse files Browse the repository at this point in the history
feat(mass): Use different endpoint for Mass
  • Loading branch information
grossir authored Nov 19, 2024
2 parents c9a2b29 + c18f917 commit 2e51b45
Show file tree
Hide file tree
Showing 13 changed files with 4,125 additions and 19,461 deletions.
1 change: 1 addition & 0 deletions juriscraper/lib/string_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,7 @@ def bad_words(self):
"smith",
"johnson",
"commissioner",
"commonwealth",
]

ags = [
Expand Down
82 changes: 51 additions & 31 deletions juriscraper/opinions/united_states/state/mass.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,51 +16,71 @@
"""

import re
from urllib.parse import urljoin

from lxml import etree, html

from juriscraper.lib.html_utils import strip_bad_html_tags_insecure
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    """
    Backscraper is implemented on `united_states_backscrapers.state.mass.py`
    """

    # Court tag kept for the backscraper; subclasses (e.g. massappct)
    # override the attributes they need.
    court_identifier = "SJC"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # JSON endpoint behind the Social Law "slip opinions" page
        self.url = "https://www.socialaw.com/customapi/slips/getopinions"
        self.court_id = self.__module__
        # NOTE(review): presumably the endpoint rejects the default
        # python-requests User-Agent, hence the browser UA — confirm
        self.request["headers"] = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15",
        }
        self.needs_special_headers = True
        self.court_name = "Supreme Judicial Court"
        self.status = "Published"
        self.expected_content_types = ["text/html"]

    def _process_html(self):
        """Scrape and process the JSON endpoint

        Iterates the parsed JSON rows, keeps only rows for this court
        (matched by `SectionName`), and appends one case dict per row.

        :return: None
        """
        for row in self.html:
            # Rows for other courts are skipped; subclasses pick their
            # own rows by overriding `court_name`
            if row["SectionName"] != self.court_name:
                continue

            url = urljoin(
                "https://www.socialaw.com/services/slip-opinions/",
                row["UrlName"],
            )
            details = row["Details"]
            caption = titlecase(row.get("Parties"))
            # Strip footnote markers like "[1]" from the case caption
            caption = re.sub(r"(\[\d{1,2}\])", "", caption)

            judge_str = details.get("Present", "")
            judge_str = re.sub(r"(\[\d{1,2}\])", "", judge_str)
            # Drop the ", JJ." suffix that follows the list of justices
            judge_str = re.sub(r"\, JJ\.", "", judge_str)

            self.cases.append(
                {
                    "name": caption,
                    "judge": judge_str,
                    "date": row["Date"],
                    "url": url,
                    "docket": details["Docket"],
                }
            )

    @staticmethod
    def cleanup_content(content):
        """Remove non-opinion HTML

        Cleanup HTML from the Social Law page so we can properly display
        the content: keep only the opinion detail container and wrap it
        in a minimal html/body document.

        :param content: The scraped HTML, as bytes
        :return: Cleaner HTML, as a unicode string
        """
        content = content.decode("utf-8")
        tree = strip_bad_html_tags_insecure(content, remove_scripts=True)
        # The opinion text lives in this ASP.NET placeholder div
        content = tree.xpath(
            "//div[@id='contentPlaceholder_ctl00_ctl00_ctl00_detailContainer']"
        )[0]
        new_tree = etree.Element("html")
        body = etree.SubElement(new_tree, "body")
        body.append(content)
        return html.tostring(new_tree, pretty_print=True, encoding="unicode")
2 changes: 1 addition & 1 deletion juriscraper/opinions/united_states/state/massappct.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ class Site(mass.Site):
def __init__(self, *args, **kwargs):
    """Set the Appeals Court identifiers on top of the base mass.Site setup."""
    super().__init__(*args, **kwargs)
    self.court_id = self.__module__
    # Selects Appeals Court rows in the parent's `_process_html`
    # (which filters the JSON rows by `SectionName` == `court_name`)
    self.court_name = "Appeals Court"
Loading

0 comments on commit 2e51b45

Please sign in to comment.