Skip to content

Commit

Permalink
Merge pull request #4499 from freelawproject/add-harvard-to-citation-parsing
Browse files: browse the repository at this point in the history

Add XML Harvard to html citation generation
  • Loading branch information
flooie authored Dec 12, 2024
2 parents ec93ad5 + 28a7780 commit 0771925
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 1 deletion.
13 changes: 12 additions & 1 deletion cl/citations/annotate_citations.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import html
import re
from typing import Dict, List

from eyecite import annotate_citations, clean_text
Expand All @@ -15,10 +16,20 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None:
:param document: The Opinion or RECAPDocument whose text should be parsed
"""
for attr in ["html_anon_2020", "html_columbia", "html_lawbox", "html"]:

# We prefer CAP data (xml_harvard) first.
for attr in [
"xml_harvard",
"html_anon_2020",
"html_columbia",
"html_lawbox",
"html",
]:
text = getattr(document, attr, None)
if text:
document.source_text = text
# Strip the leading XML declaration (<?xml ... ?>) found in xml_harvard
text = re.sub(r"^<\?xml.*?\?>", "", text, count=1)
document.cleaned_text = clean_text(
text, ["html", "all_whitespace"]
)
Expand Down
42 changes: 42 additions & 0 deletions cl/citations/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,48 @@ def test_make_html_from_html(self) -> None:
msg=f"\n{created_html}\n\n !=\n\n{expected_html}",
)

def test_make_html_from_harvard_xml(self) -> None:
    """Can we convert the Harvard XML of an opinion into cited HTML?

    Each entry in ``test_pairs`` is ``(xml_harvard input, expected output)``.
    The input is set as ``Opinion.xml_harvard``, run through
    ``get_and_clean_opinion_text()`` and ``get_citations()``, and the result
    of ``create_cited_html()`` is compared with the expected markup, in
    which the citation is wrapped in ``<span class="citation no-link">``.
    """
    # fmt: off

    test_pairs = [
        # Citation with XML encoding. The expected output still begins
        # with the XML declaration.
        # NOTE(review): presumably because create_cited_html() annotates
        # the untouched source text rather than the cleaned text -- confirm.
        ('<?xml version="1.0" encoding="utf-8"?><opinion type="majority">'
         '<p id="b148-5"> <em> Swift &amp; Co. </em>v. '
         '<em> United States,</em> 196 U. S. 375:</p></opinion>',
         '<?xml version="1.0" encoding="utf-8"?><opinion type="majority">'
         '<p id="b148-5"> <em> Swift &amp; Co. </em>v. '
         '<em> United States,</em> '
         '<span class="citation no-link">196 U. S. 375</span>:</p>'
         '</opinion>'),
    ]

    # fmt: on
    for s, expected_html in test_pairs:
        # The message names the conversion under test (XML -> HTML) so a
        # failing subtest is not mistaken for the HTML -> HTML test.
        with self.subTest(
            f"Testing xml to html conversion for {s}...",
            s=s,
            expected_html=expected_html,
        ):
            opinion = Opinion(xml_harvard=s)
            get_and_clean_opinion_text(opinion)
            citations = get_citations(
                opinion.cleaned_text, tokenizer=HYPERSCAN_TOKENIZER
            )

            # Stub out fake output from do_resolve_citations(), since the
            # purpose of this test is not to test that. We just need
            # something that looks like what create_cited_html() expects
            # to receive.
            citation_resolutions = {NO_MATCH_RESOURCE: citations}

            created_html = create_cited_html(opinion, citation_resolutions)
            self.assertEqual(
                created_html,
                expected_html,
                msg=f"\n{created_html}\n\n !=\n\n{expected_html}",
            )

def test_make_html_from_matched_citation_objects(self) -> None:
"""Can we render matched citation objects as HTML?"""
# This test case is similar to the two above, except it allows us to
Expand Down

0 comments on commit 0771925

Please sign in to comment.