fix(idaho, idahoctapp): update to OpinionSiteLinear
Solves #1278

The bug was caused by a multi-line docket number; the easiest way to fix it was updating to OpinionSiteLinear:
- simplified logic
- deleted redundant example files
- updated usage of `__init__` in inheriting scrapers
grossir committed Dec 30, 2024
1 parent dde9d68 commit aa0d423
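
For context on the root cause: the old `_get_docket_numbers` (removed in the diff below) collected `td[2]//text()`, so a docket cell whose text is split across several nodes produced several list entries, misaligning the docket column against the case names. `OpinionSiteLinear` builds one dict per row instead, and XPath's `string()` flattens the whole cell. A minimal sketch of the difference, using invented HTML rather than the real Idaho markup:

```python
from lxml import html

# Invented row: the docket cell contains a <br/>, i.e. two text nodes.
row = html.fromstring(
    "<table><tr>"
    "<td>Dec 20, 2024</td>"
    "<td>51234<br/>\n51235</td>"
    "<td>Smith v. Jones</td>"
    '<td><a href="/op.pdf">Opinion</a></td>'
    "</tr></table>"
).xpath("//tr")[0]

# Old approach: //text() yields one list entry per text node, so a single
# row produces two "dockets" and the column no longer lines up with names.
print([t.strip() for t in row.xpath("td[2]//text()") if t.strip()])
# ['51234', '51235']

# New approach: string(td[2]) flattens the cell to a single value per row.
print(repr(row.xpath("string(td[2])").strip()))
# '51234\n51235'  (one entry; inner whitespace can be normalized separately)
```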
Showing 12 changed files with 7,628 additions and 18,247 deletions.
94 changes: 34 additions & 60 deletions juriscraper/opinions/united_states/state/idaho_civil.py
@@ -8,17 +8,13 @@
 - 2015-10-20, mlr: Updated due to new page in use.
 - 2015-10-23, mlr: Updated to handle annoying situation.
 - 2016-02-25 arderyp: Updated to catch "ORDER" (in addition to "Order") in download url text
+- 2024-12-30, grossir: updated to OpinionSiteLinear
 """
 
-from lxml import html
-
-from juriscraper.lib.string_utils import clean_if_py3, convert_date_string
-from juriscraper.OpinionSite import OpinionSite
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
-class Site(OpinionSite):
-    # Skip first row of table, it's a header
-    path_table_row_start = "//table//tr[position() > 1]"
+class Site(OpinionSiteLinear):
     # Skip rows that don't have link in 4th cell with
     # either 'Opinion', 'Order', 'ORDER', or 'Amend' in
     # the link text
@@ -30,66 +26,44 @@ class Site(OpinionSite):
         'contains(.//text(), "Amended")'
         "]"
     )
-    path_conditional_row = f"/td[4]//{path_conditional_anchor}"
-    path_base = f"{path_table_row_start}[./{path_conditional_row}]"
+
+    # https://www.isc.idaho.gov/appeals-court/isc_civil
+    base_url = "https://www.isc.idaho.gov/appeals-court/"
+    url_part = "isc_civil"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.url = "https://www.isc.idaho.gov/appeals-court/isc_civil"
+        self.url = f"{self.base_url}{self.url_part}"
         self.court_id = self.__module__
+        self.status = "Published"
 
-    def _get_case_names(self):
-        case_names = []
-        path = f"{self.path_base}/td[3]"
-        for cell in self.html.xpath(path):
-            name_string = html.tostring(
-                cell, method="text", encoding="unicode"
-            )
-            name_string = clean_if_py3(name_string).strip()
-            if name_string:
-                case_names.append(name_string)
-        return case_names
-
-    def _get_download_urls(self):
-        # We'll accept an order document if the opinion document
-        # is missing. But we obviously prefer the opinion doc,
-        # as indicated in the algorithm below. Since each row
-        # can list multiple valid links, we will parse all
-        # acceptable links, take the opinion link if present,
-        # otherwise take the first acceptable link.
-        opinion_urls = []
-        path = f"{self.path_base}/td[4]"
-        path_link = f".//{self.path_conditional_anchor}"
-        for cell in self.html.xpath(path):
-            urls = []
-            url_opinion = False
-            for link in cell.xpath(path_link):
-                text = link.text_content().strip()
-                url = link.attrib["href"].replace("http://", "https://")
-                urls.append(url)
-                if "Opinion" in text:
-                    url_opinion = url
-            opinion_urls.append(url_opinion if url_opinion else urls[0])
-        return opinion_urls
-
-    def _get_case_dates(self):
-        case_dates = []
-        path = f"{self.path_base}/td[1]"
-        for cell in self.html.xpath(path):
-            date_string = html.tostring(
-                cell, method="text", encoding="unicode"
-            )
-            date_string = clean_if_py3(date_string).strip()
-            if date_string:
-                date_string = date_string.replace(
-                    "Sept ", "Sep "
-                )  # GIGO! (+1 by arderyp)
-                case_dates.append(convert_date_string(date_string))
-        return case_dates
-
-    def _get_docket_numbers(self):
-        path = f"{self.path_base}/td[2]//text()"
-        return [text.strip() for text in self.html.xpath(path) if text.strip()]
-
-    def _get_precedential_statuses(self):
-        return ["Published"] * len(self.case_names)
+    def _process_html(self):
+        row_xpath = f"//table//tr[.//{self.path_conditional_anchor}]"
+        for row in self.html.xpath(row_xpath):
+            url = self.get_opinion_url(row).replace("http://", "https://")
+            self.cases.append(
+                {
+                    "date": row.xpath("string(td[1])").strip(),
+                    "docket": row.xpath("string(td[2])").strip(),
+                    "name": row.xpath("string(td[3])").strip(),
+                    "url": url,
+                }
+            )
+
+    def get_opinion_url(self, row) -> str:
+        """Gets the URL tagged as an Opinion, if possible
+
+        We'll accept an order document if the opinion document
+        is missing. Since each row can list multiple valid links,
+        we will parse all acceptable links, take the opinion link
+        if present, otherwise take the first acceptable link.
+
+        :param row: the lxml object of the row
+        :return: the document URL
+        """
+        for link in row.xpath("td[4]//a"):
+            if "Opinion" in link.text_content().strip():
+                return link.xpath("@href")[0]
+
+        return row.xpath("td[4]//a/@href")[0]
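
The docstring of `get_opinion_url` describes a preference-with-fallback rule: return the link whose text contains "Opinion" when one exists, otherwise the first link in the fourth cell. A quick standalone check of that logic against two invented rows (not taken from the test fixtures):

```python
from lxml import html

# Two invented rows: one with both an Order and an Opinion link,
# one with only an amended-order link.
rows = html.fromstring(
    "<table>"
    "<tr><td></td><td></td><td></td>"
    '<td><a href="/order.pdf">Order</a> '
    '<a href="/opinion.pdf">Opinion</a></td></tr>'
    "<tr><td></td><td></td><td></td>"
    '<td><a href="/amended.pdf">Amended Order</a></td></tr>'
    "</table>"
).xpath("//tr")

def get_opinion_url(row):
    # Mirrors the method above: prefer the "Opinion" link, else first link.
    for link in row.xpath("td[4]//a"):
        if "Opinion" in link.text_content().strip():
            return link.xpath("@href")[0]
    return row.xpath("td[4]//a/@href")[0]

print(get_opinion_url(rows[0]))  # /opinion.pdf  (Opinion preferred)
print(get_opinion_url(rows[1]))  # /amended.pdf  (first acceptable link)
```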
6 changes: 2 additions & 4 deletions juriscraper/opinions/united_states/state/idaho_criminal.py
@@ -2,7 +2,5 @@
 
 
 class Site(idaho_civil.Site):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.url = "https://www.isc.idaho.gov/appeals-court/isc_criminal"
-        self.court_id = self.__module__
+    # https://www.isc.idaho.gov/appeals-court/isc_criminal
+    url_part = "isc_criminal"
6 changes: 2 additions & 4 deletions juriscraper/opinions/united_states/state/idahoctapp_civil.py
@@ -2,7 +2,5 @@
 
 
 class Site(idaho_civil.Site):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.url = "https://www.isc.idaho.gov/appeals-court/coa_civil"
-        self.court_id = self.__module__
+    # https://www.isc.idaho.gov/appeals-court/coa_civil
+    url_part = "coa_civil"
6 changes: 2 additions & 4 deletions juriscraper/opinions/united_states/state/idahoctapp_criminal.py
@@ -2,7 +2,5 @@
 
 
 class Site(idaho_civil.Site):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.url = "https://www.isc.idaho.gov/appeals-court/coa_criminal"
-        self.court_id = self.__module__
+    # https://www.isc.idaho.gov/appeals-court/coa_criminal
+    url_part = "coa_criminal"
(8 more changed files not shown.)
