Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(extract_from_text): now returns a plain citation string #1298

Merged
merged 1 commit into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion juriscraper/OpinionSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def _get_other_dates(self):
def _get_attorneys(self):
return None

def extract_from_text(self, scraped_text):
def extract_from_text(self, scraped_text: str) -> dict:
"""Pass scraped text into function and return data as a dictionary

:param opinion_text: Text of scraped content
Expand Down
5 changes: 2 additions & 3 deletions juriscraper/opinions/united_states/state/nd.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,12 +115,11 @@ def extract_from_text(self, scraped_text: str) -> dict:
of models field - value pairs
"""
metadata = {}
regex = r"(?P<volume>20\d{2})\s(?P<reporter>ND)\s(?P<page>\d+)"
regex = r"20\d{2}\sND\s\d+"
citation_match = re.search(regex, scraped_text[:1000])

if citation_match:
# type 8 is a neutral citation in Courtlistener
metadata["Citation"] = {**citation_match.groupdict(), "type": 8}
metadata["Citation"] = citation_match.group(0)
Comment on lines 116 to +122
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like ND doesn't need this anymore.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just saw that ND has changed; it will require further changes to clean the case name.
Do you mind if I merge this and do the ND changes in a different branch?


# Most times, paragraphs are enumerated. The data we are interested
# in is in a few lines before the first paragraph
Expand Down
7 changes: 3 additions & 4 deletions juriscraper/opinions/united_states/state/nytrial.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,8 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
pattern = r"</table><br><br\s?/?>\s?(.*)\r?\n|Docket Number:\s?(.+)"
docket_number = self.match(scraped_text, pattern)

pattern = r"\[(?P<volume>\d+) (?P<reporter>Misc 3d) (?P<page>.+)\]"
cite_match = re.search(pattern, scraped_text[:2000])
regex_citation = r"(?<=\[)\d+ Misc 3d .+(?=\])"
cite_match = re.search(regex_citation, scraped_text[:2000])

# Only for .htm links
full_case = None
Expand All @@ -150,8 +150,7 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
"author_str": normalize_judge_string(judge)[0]
}
if cite_match:
metadata["Citation"] = cite_match.groupdict("")
metadata["Citation"]["type"] = 2 # 'State' type in courtlistener
metadata["Citation"] = cite_match.group(0)
if full_case:
full_case = harmonize(full_case)
metadata["Docket"]["case_name_full"] = full_case
Expand Down
8 changes: 2 additions & 6 deletions juriscraper/opinions/united_states/state/pasuperct.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,8 @@ def extract_from_text(self, scraped_text: str) -> Dict:

Not all scraped opinions have them
"""
neutral_citation_regex = (
r"(?P<volume>\d{4}) (?P<reporter>PA Super) (?P<page>\d+)"
)
neutral_citation_regex = r"\d{4} PA Super \d+"
if cite_match := re.search(neutral_citation_regex, scraped_text[:200]):
cite_data = cite_match.groupdict()
cite_data["type"] = 8 # Neutral citation
return {"Citation": cite_data}
return {"Citation": cite_match.group(0)}

return {}
7 changes: 2 additions & 5 deletions juriscraper/opinions/united_states/state/vt.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,8 @@ def set_url(
self.url = f"{self.base_url}?{urlencode(params)}"

def extract_from_text(self, scraped_text: str):
match = re.search(
r"(?P<volume>\d{4}) (?P<reporter>VT) (?P<page>\d+)",
scraped_text[:1000],
)
match = re.search(r"\d{4} VT \d+", scraped_text[:1000])
if match:
return {"Citation": {"type": 8, **match.groupdict()}}
return {"Citation": match.group(0)}

return {}
9 changes: 3 additions & 6 deletions juriscraper/opinions/united_states/state/wis.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,7 @@ def __init__(self, *args, **kwargs):
self.base_url = "https://www.wicourts.gov/supreme/scopin.jsp"
self.status = "Published"
self.set_url()
self.cite_regex = (
r"(?P<volume>20\d{2})\s(?P<reporter>WI)\s(?P<page>\d+)"
)
self.cite_regex = r"20\d{2}\sWI\s\d+"
self.make_backscrape_iterable(kwargs)

def set_url(
Expand Down Expand Up @@ -73,10 +71,9 @@ def extract_from_text(self, scraped_text: str) -> dict:
:return: date filed
"""
first_line = scraped_text[:100].splitlines()[0]
match = re.search(self.cite_regex, first_line)
if match := re.search(self.cite_regex, first_line):
return {"Citation": match.group(0)}

if match:
return {"Citation": {**match.groupdict(), "type": 8}}
return {}

def _download_backwards(self, dates: Tuple[date]) -> None:
Expand Down
4 changes: 1 addition & 3 deletions juriscraper/opinions/united_states/state/wisctapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ def __init__(self, *args, **kwargs):
self.court_id = self.__module__
self.base_url = "https://www.wicourts.gov/other/appeals/caopin.jsp"
self.set_url()
self.cite_regex = (
r"(?P<volume>20\d{2})\s(?P<reporter>WI App)\s(?P<page>\d+)"
)
self.cite_regex = r"20\d{2}\sWI App\s\d+"

def combine_opinions(self, url: str, docket_number: str) -> bool:
"""Combine duplicate opinions in self.cases
Expand Down
119 changes: 17 additions & 102 deletions tests/local/test_ScraperExtractFromTextTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,7 @@ class ScraperExtractFromText(unittest.TestCase):
"case_name_full": '1125 Morris Avenue Realty LLC, Plaintiff(s), against Title Issues Agency LLC, MARTIN E. KOFMAN, STEVEN LOWENTHAL, ESQ., and LOWENTHAL PC, "JOHN DOE," "JANE DOE," "ABC CORPORATION," AND "XYZ CORPORATION," Defendant(s).',
},
"Opinion": {"author_str": "Fidel E. Gomez"},
"Citation": {
"volume": "81",
"reporter": "Misc 3d",
"page": "1215(A)",
"type": 2,
},
"Citation": "81 Misc 3d 1215(A)",
"OpinionCluster": {
"case_name_full": '1125 Morris Avenue Realty LLC, Plaintiff(s), against Title Issues Agency LLC, MARTIN E. KOFMAN, STEVEN LOWENTHAL, ESQ., and LOWENTHAL PC, "JOHN DOE," "JANE DOE," "ABC CORPORATION," AND "XYZ CORPORATION," Defendant(s).'
},
Expand Down Expand Up @@ -225,12 +220,7 @@ class ScraperExtractFromText(unittest.TestCase):
"case_name_full": "The People of the State of New York, against J.S., Adolescent Offender.",
},
"Opinion": {"author_str": "Conrad D. Singer"},
"Citation": {
"volume": "66",
"reporter": "Misc 3d",
"page": "1213(A)",
"type": 2,
},
"Citation": "66 Misc 3d 1213(A)",
"OpinionCluster": {
"case_name_full": "The People of the State of New York, against J.S., Adolescent Offender."
},
Expand Down Expand Up @@ -264,12 +254,7 @@ class ScraperExtractFromText(unittest.TestCase):
"OpinionCluster": {
"case_name_full": '201 East 164th Street Associates, LLC, against Pastora Calderon & ROSA IDALIA ABDELNOUR, "JOHN DOE" & "JANE DOE" A/K/A DUNIA GOMEZ Respondents-Undertenants.'
},
"Citation": {
"volume": "81",
"reporter": "Misc 3d",
"page": "1211(A)",
"type": 2,
},
"Citation": "81 Misc 3d 1211(A)",
},
),
],
Expand All @@ -283,12 +268,7 @@ class ScraperExtractFromText(unittest.TestCase):
"case_name_full": "Probate Proceeding, Will of Pia Jeong Yoon, a/k/a PIA JEONG AE YOON, a/k/a PIA J. YOON, a/k/a JEONG YOON, a/k/a JEONG AE YOON",
},
"Opinion": {"author_str": "Peter J. Kelly"},
"Citation": {
"volume": "78",
"reporter": "Misc 3d",
"page": "1203(A)",
"type": 2,
},
"Citation": "78 Misc 3d 1203(A)",
"OpinionCluster": {
"case_name_full": "Probate Proceeding, Will of Pia Jeong Yoon, a/k/a PIA JEONG AE YOON, a/k/a PIA J. YOON, a/k/a JEONG YOON, a/k/a JEONG AE YOON"
},
Expand All @@ -313,12 +293,7 @@ class ScraperExtractFromText(unittest.TestCase):
"case_name_full": "In the Matter of a Proceeding Under Article 6 of the Family Court Act Robyn C., against William M. J. (Deceased) and EVA JANE P.",
},
"Opinion": {"author_str": "Javier E. Vargas"},
"Citation": {
"volume": "66",
"reporter": "Misc 3d",
"page": "1210(A)",
"type": 2,
},
"Citation": "66 Misc 3d 1210(A)",
"OpinionCluster": {
"case_name_full": "In the Matter of a Proceeding Under Article 6 of the Family Court Act Robyn C., against William M. J. (Deceased) and EVA JANE P."
},
Expand All @@ -333,12 +308,7 @@ class ScraperExtractFromText(unittest.TestCase):
"case_name_full": "In the Matter of a Proceeding for Support Under Article 4 of the Family Court Act Michelle B., against Thomas Y.",
},
"Opinion": {"author_str": "Javier E. Vargas"},
"Citation": {
"volume": "73",
"reporter": "Misc 3d",
"page": "1238(A)",
"type": 2,
},
"Citation": "73 Misc 3d 1238(A)",
"OpinionCluster": {
"case_name_full": "In the Matter of a Proceeding for Support Under Article 4 of the Family Court Act Michelle B., against Thomas Y."
},
Expand All @@ -355,12 +325,7 @@ class ScraperExtractFromText(unittest.TestCase):
"case_name_full": "The People of the State of New York against Amela Hot",
},
"Opinion": {"author_str": "Donald Leo"},
"Citation": {
"volume": "58",
"reporter": "Misc 3d",
"page": "1215(A)",
"type": 2,
},
"Citation": "58 Misc 3d 1215(A)",
"OpinionCluster": {
"case_name_full": "The People of the State of New York against Amela Hot"
},
Expand All @@ -375,12 +340,7 @@ class ScraperExtractFromText(unittest.TestCase):
"case_name_full": "The People of the State of New York, against James Smith",
},
"Opinion": {"author_str": "Althea E. Drysdale"},
"Citation": {
"volume": "59",
"reporter": "Misc 3d",
"page": "1211(A)",
"type": 2,
},
"Citation": "59 Misc 3d 1211(A)",
"OpinionCluster": {
"case_name_full": "The People of the State of New York, against James Smith"
},
Expand Down Expand Up @@ -408,12 +368,7 @@ class ScraperExtractFromText(unittest.TestCase):
"OpinionCluster": {
"case_name_full": "Bernardo Martinaj, against State of New York"
},
"Citation": {
"volume": "78",
"reporter": "Misc 3d",
"page": "1211(A)",
"type": 2,
},
"Citation": "78 Misc 3d 1211(A)",
},
),
],
Expand All @@ -427,12 +382,7 @@ class ScraperExtractFromText(unittest.TestCase):
"case_name_full": "Alistair Sims, against Lance Frantz Regis A/K/A LANCE REGIS A/K/A LANCE F. REGIS A/K/A FRANTZ L. REGISTRE A/K/A REGISTRE FRANTZ A/K/A VANCE REGIS A/K/A REGIS LANCE A/K/A REGIS L. FRANTZ, Defendant(s).",
},
"Opinion": {"author_str": "Michael A. Montesano"},
"Citation": {
"volume": "81",
"reporter": "Misc 3d",
"page": "1210(A)",
"type": 2,
},
"Citation": "81 Misc 3d 1210(A)",
"OpinionCluster": {
"case_name_full": "Alistair Sims, against Lance Frantz Regis A/K/A LANCE REGIS A/K/A LANCE F. REGIS A/K/A FRANTZ L. REGISTRE A/K/A REGISTRE FRANTZ A/K/A VANCE REGIS A/K/A REGIS LANCE A/K/A REGIS L. FRANTZ, Defendant(s)."
},
Expand Down Expand Up @@ -477,25 +427,15 @@ class ScraperExtractFromText(unittest.TestCase):
# https://www.courtlistener.com/api/rest/v3/opinions/10473075/
"""IN THE SUPREME COURT\n STATE OF NORTH DAKOTA\n\n 2024 ND 143\n\nRonald Wayne Wootan, Petitioner and Appellant\n v.\nState of North Dakota, Respondent and Appellee\n\n No. 20240025\n\nAppeal from the District Court of Rolette County, Northeast Judicial District,\nthe Honorable Anthony S. Benson, Judge.\n\nAFFIRMED.\n\nPer Curiam.\n\nKiara C. Kraus-Parr, Grand Forks, ND, for petitioner and appellant.\n\nBrian D. Grosinger, State’s Attorney, Rolla, ND, for respondent and appellee.\n\f Wootan v. State\n No. 20240025\n\nPer Curiam.\n\n Ronald Wootan appeals from an order denying his postconviction relief\napplication entered after the district court held an evidentiary hearing on\nremand. See Wootan v. State,""",
{
"Citation": {
"volume": "2024",
"reporter": "ND",
"page": "143",
"type": 8,
},
"Citation": "2024 ND 143",
},
),
(
# Example of a consolidated case
# https://www.courtlistener.com/api/rest/v3/opinions/10473085/
"""IN THE SUPREME COURT\n STATE OF NORTH DAKOTA\n\n 2024 ND 141\n\nRenae Irene Gerszewski, Petitioner and Appellee\n v.\nConrad Keith Rostvet, Respondent and Appellant\n\n\n\n No. 20230361\n\n\n\nConrad Keith Rostvet, Petitioner and Appellant\n v.\nRenae Irene Gerszewski, Respondent and Appellee\n\n\n\n No. 20230362\n\n\n\nConrad Rostvet, Petitioner and Appellant\n v.\nWayne Gerszewski, Respondent and Appellee\n\n\n\n No. 20230363\n\n\n\nAppeal from the District Court of Walsh County, Northeast Judicial District, the Honorable\nBarbara L. Whelan, Judge.\n\fAFFIRMED.\n\nOpinion of the Court by Tufte, Justice.\n\nSamuel A. Gereszek, Grand Forks, N.D., for appellees.\n\nTimothy C. Lamb, Grand Forks, N.D., for appellant.\n\f Gerszewski v. Rostvet\n Nos. 20230361–20230363\n\nTufte, Justice.\n\n[¶1] Conrad Rostvet appeals from a district court’s order""",
{
"Citation": {
"volume": "2024",
"reporter": "ND",
"page": "141",
"type": 8,
},
"Citation": "2024 ND 141",
"OpinionCluster": {"case_name": "Gerszewski v. Rostvet"},
"Docket": {
"case_name": "Gerszewski v. Rostvet",
Expand All @@ -509,12 +449,7 @@ class ScraperExtractFromText(unittest.TestCase):
# https://www.wicourts.gov/sc/opinion/DisplayDocument.pdf?content=pdf&seqNo=669658
"""2023 WI 50\nS C W\nUPREME OURT OF ISCONSIN\nCASE NO.: 2021AP938-CR\nCOMPLETE TITLE: State of Wisconsin,""",
{
"Citation": {
"volume": "2023",
"reporter": "WI",
"page": "50",
"type": 8,
},
"Citation": "2023 WI 50",
},
)
],
Expand All @@ -523,12 +458,7 @@ class ScraperExtractFromText(unittest.TestCase):
# https://www.wicourts.gov/ca/opinion/DisplayDocument.pdf?content=pdf&seqNo=799325
"""2024 WI App 36\nCOURT OF APPEALS OF WISCONSIN\nPUBLISHED OPINION""",
{
"Citation": {
"volume": "2024",
"reporter": "WI App",
"page": "36",
"type": 8,
},
"Citation": "2024 WI App 36",
},
)
],
Expand Down Expand Up @@ -600,12 +530,7 @@ class ScraperExtractFromText(unittest.TestCase):
# https://www.courtlistener.com/api/rest/v3/opinions/10566596/
"""NOTICE: This opinion is subject to motions for reargument under V.R.A.P. 40 as well as formal\nrevision before publication in the Vermont Reports. Readers are requested to notify the Reporter\nof Decisions by email at: [email protected] or by mail at: Vermont Supreme Court, 109\nState Street, Montpelier, Vermont 05609-0801, of any errors in order that corrections may be made\nbefore this opinion goes to press.\n\n\n 2024 VT 52\n\n No. 23-AP-226\n\nState of Vermont """,
{
"Citation": {
"volume": "2024",
"reporter": "VT",
"page": "52",
"type": 8,
}
"Citation": "2024 VT 52",
},
)
],
Expand All @@ -614,12 +539,7 @@ class ScraperExtractFromText(unittest.TestCase):
# https://www.courtlistener.com/api/rest/v3/clusters/7854285/
"""NOTICE: This opinion is subject to motions for reargument under V.R.A.P. 40 as well as formal\nrevision before publication in the Vermont Reports. Readers are requested to notify the Reporter\nof Decisions by email at: [email protected] or by mail at: Vermont Supreme Court, 109\nState Street, Montpelier, Vermont 05609-0801, of any errors in order that corrections may be made\nbefore this opinion goes to press.\n\n\n 2022 VT 35\n\n No. 2021-059\n\nState of Vermont Supreme Court\n\n On Appeal from\n v. Superior Court, Chittenden Unit,\n Criminal Division\n\nRandy F. Therrien """,
{
"Citation": {
"volume": "2022",
"reporter": "VT",
"page": "35",
"type": 8,
}
"Citation": "2022 VT 35",
},
)
],
Expand Down Expand Up @@ -696,12 +616,7 @@ class ScraperExtractFromText(unittest.TestCase):
(
"J-A13044-21\n\n 2021 PA Super 113\n\n\n COMMONWEALTH OF PENNSYLVANIA : IN THE SUPERIOR COURT OF\n : PENNSYLVANIA\n :\n ",
{
"Citation": {
"volume": "2021",
"reporter": "PA Super",
"page": "113",
"type": 8,
}
"Citation": "2021 PA Super 113",
},
)
],
Expand Down
Loading