diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py index f4184d64dd..ea7d965f9e 100644 --- a/cl/citations/annotate_citations.py +++ b/cl/citations/annotate_citations.py @@ -1,4 +1,5 @@ import html +import re from typing import Dict, List from eyecite import annotate_citations, clean_text @@ -15,10 +16,20 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None: :param document: The Opinion or RECAPDocument whose text should be parsed """ - for attr in ["html_anon_2020", "html_columbia", "html_lawbox", "html"]: + + # We prefer CAP data (xml_harvard) first. + for attr in [ + "xml_harvard", + "html_anon_2020", + "html_columbia", + "html_lawbox", + "html", + ]: text = getattr(document, attr, None) if text: document.source_text = text + # Remove XML encodings from xml_harvard + text = re.sub(r"^<\?xml.*?\?>", "", text, count=1) document.cleaned_text = clean_text( text, ["html", "all_whitespace"] ) diff --git a/cl/citations/tests.py b/cl/citations/tests.py index 313692b880..59b1e7a986 100644 --- a/cl/citations/tests.py +++ b/cl/citations/tests.py @@ -267,6 +267,48 @@ def test_make_html_from_html(self) -> None: msg=f"\n{created_html}\n\n !=\n\n{expected_html}", ) + def test_make_html_from_harvard_xml(self) -> None: + """Can we convert the XML of an opinion into modified HTML?""" + # fmt: off + + test_pairs = [ + # Citation with XML encoding + ('' + '

Swift & Co. v. ' + ' United States, 196 U. S. 375:

', + '' + '

Swift & Co. v. ' + ' United States, ' + '196 U. S. 375:

' + '
'), + ] + + # fmt: on + for s, expected_html in test_pairs: + with self.subTest( + f"Testing html to html conversion for {s}...", + s=s, + expected_html=expected_html, + ): + opinion = Opinion(xml_harvard=s) + get_and_clean_opinion_text(opinion) + citations = get_citations( + opinion.cleaned_text, tokenizer=HYPERSCAN_TOKENIZER + ) + + # Stub out fake output from do_resolve_citations(), since the + # purpose of this test is not to test that. We just need + # something that looks like what create_cited_html() expects + # to receive. + citation_resolutions = {NO_MATCH_RESOURCE: citations} + + created_html = create_cited_html(opinion, citation_resolutions) + self.assertEqual( + created_html, + expected_html, + msg=f"\n{created_html}\n\n !=\n\n{expected_html}", + ) + def test_make_html_from_matched_citation_objects(self) -> None: """Can we render matched citation objects as HTML?""" # This test case is similar to the two above, except it allows us to