Merge pull request #4499 from freelawproject/add-harvard-to-citation-parsing

Add XML Harvard to html citation generation
freelawproject · Dec 12, 2024 · 0771925 · 0771925
2 parents ec93ad5 + 28a7780
commit 0771925
Show file tree

Hide file tree

Showing 2 changed files with 54 additions and 1 deletion.
diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py
@@ -1,4 +1,5 @@
 import html
+import re
 from typing import Dict, List
 
 from eyecite import annotate_citations, clean_text
@@ -15,10 +16,20 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None:
 
     :param document: The Opinion or RECAPDocument whose text should be parsed
     """
-    for attr in ["html_anon_2020", "html_columbia", "html_lawbox", "html"]:
+
+    # We prefer CAP data (xml_harvard) first.
+    for attr in [
+        "xml_harvard",
+        "html_anon_2020",
+        "html_columbia",
+        "html_lawbox",
+        "html",
+    ]:
         text = getattr(document, attr, None)
         if text:
             document.source_text = text
+            # Strip the leading XML declaration (<?xml ...?>), if present
+            text = re.sub(r"^<\?xml.*?\?>", "", text, count=1)
             document.cleaned_text = clean_text(
                 text, ["html", "all_whitespace"]
             )

diff --git a/cl/citations/tests.py b/cl/citations/tests.py
@@ -267,6 +267,48 @@ def test_make_html_from_html(self) -> None:
                     msg=f"\n{created_html}\n\n    !=\n\n{expected_html}",
                 )
 
+    def test_make_html_from_harvard_xml(self) -> None:
+        """Can we convert the XML of an opinion into modified HTML?"""
+        # fmt: off
+
+        test_pairs = [
+            # Citation in text that starts with an XML declaration
+            ('<?xml version="1.0" encoding="utf-8"?><opinion type="majority">'
+             '<p id="b148-5"> <em> Swift &amp; Co. </em>v. '
+             '<em> United States,</em> 196 U. S. 375:</p></opinion>',
+             '<?xml version="1.0" encoding="utf-8"?><opinion type="majority">'
+             '<p id="b148-5"> <em> Swift &amp; Co. </em>v. '
+             '<em> United States,</em> '
+             '<span class="citation no-link">196 U. S. 375</span>:</p>'
+             '</opinion>'),
+        ]
+
+        # fmt: on
+        for s, expected_html in test_pairs:
+            with self.subTest(
+                f"Testing html to html conversion for {s}...",
+                s=s,
+                expected_html=expected_html,
+            ):
+                opinion = Opinion(xml_harvard=s)
+                get_and_clean_opinion_text(opinion)
+                citations = get_citations(
+                    opinion.cleaned_text, tokenizer=HYPERSCAN_TOKENIZER
+                )
+
+                # Stub out fake output from do_resolve_citations(), since the
+                # purpose of this test is not to test that. We just need
+                # something that looks like what create_cited_html() expects
+                # to receive.
+                citation_resolutions = {NO_MATCH_RESOURCE: citations}
+
+                created_html = create_cited_html(opinion, citation_resolutions)
+                self.assertEqual(
+                    created_html,
+                    expected_html,
+                    msg=f"\n{created_html}\n\n    !=\n\n{expected_html}",
+                )
+
     def test_make_html_from_matched_citation_objects(self) -> None:
         """Can we render matched citation objects as HTML?"""
         # This test case is similar to the two above, except it allows us to