Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix court string matching with whitespace #144

Merged
22 changes: 15 additions & 7 deletions eyecite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
POST_SHORT_CITATION_REGEX,
YEAR_REGEX,
)
from eyecite.utils import strip_punct

BACKWARD_SEEK = 28 # Median case name length in the CL db is 28 (2016-02-26)

Expand All @@ -41,17 +40,26 @@ def get_court_by_paren(paren_string: str) -> Optional[str]:
Does not work on SCOTUS, since that court lacks parentheticals, and
needs to be handled after disambiguation has been completed.
"""
court_str = strip_punct(paren_string)

# Remove whitespace and punctuation because citation strings sometimes lack
# internal spaces, e.g. "Pa.Super." or "SC" (South Carolina)
court_str = re.sub(r"[^\w]", "", paren_string).lower()

court_code = None
if court_str:
# Map the string to a court, if possible.
for court in courts:
# Use startswith because citations are often missing final period,
# e.g. "2d Cir"
if court["citation_string"].startswith(court_str):
s = re.sub(r"[^\w]", "", court["citation_string"]).lower()

# Check for an exact match first
if s == court_str:
return str(court["id"])

# If no exact match, try to record a startswith match for possible
# eventual return
if s.startswith(court_str):
court_code = court["id"]
break

return court_code

return court_code

Expand Down
19 changes: 19 additions & 0 deletions tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,19 @@ def test_find_citations(self):
'defendant': 'test',
'court': 'ca4',
'pin_cite': '347-348'})]),
# Test with court string without space
('bob lissner v. test 1 U.S. 12, 347-348 (Pa.Super. 1982)',
[case_citation(page='12', year=1982,
metadata={'plaintiff': 'lissner',
'defendant': 'test',
'court': 'pasuperct',
'pin_cite': '347-348'})]),
# Test with court string exact match
('Commonwealth v. Muniz, 164 A.3d 1189 (Pa. 2017)',
[case_citation(page='1189', reporter='A.3d', volume='164', year=2017,
metadata={'plaintiff': 'Commonwealth',
'defendant': 'Muniz',
'court': 'pa'})]),
# Parallel cite with parenthetical
('bob lissner v. test 1 U.S. 12, 347-348, 1 S. Ct. 2, 358 (4th Cir. 1982) (overruling foo)',
[case_citation(page='12', year=1982,
Expand Down Expand Up @@ -475,6 +488,12 @@ def test_find_citations(self):
# Long pin cite -- make sure no catastrophic backtracking in regex
('1 U.S. 1, 2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2286, 2287, 2288, 2289, 2290, 2291',
[case_citation(metadata={'pin_cite': '2277, 2278, 2279, 2280, 2281, 2282, 2283, 2284, 2286, 2287, 2288, 2289, 2290, 2291'})]),
('Commonwealth v. Muniz, 164 A.3d 1189 (Pa. 2017)', [
case_citation(volume='164', reporter='A.3d', year=2017,
page='1189',
metadata={'plaintiff': 'Commonwealth', 'defendant': 'Muniz',
'court': 'pa'})]),
('Foo v. Bar, 1 F.Supp. 1 (SC 1967)', [case_citation(volume='1', reporter='F.Supp.', year=1967, page='1', metadata={'plaintiff': 'Foo', 'defendant': 'Bar', 'court': 'sc'})]),
)
# fmt: on
self.run_test_pairs(test_pairs, "Citation extraction")
Expand Down
Loading