From 3c649dd12f5b70a8640cc76099f2256dc318f622 Mon Sep 17 00:00:00 2001
From: William Palin <bill@free.law>
Date: Wed, 25 Sep 2024 19:16:54 -0400
Subject: [PATCH 1/4] feat(annotate_citations): Update citation parsing

Add Harvard XML to
get_and_clean_opinion_text
so that we can find and add
annotations in the harvard corpus
---
 cl/citations/annotate_citations.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py
index f4184d64dd..b94c52786f 100644
--- a/cl/citations/annotate_citations.py
+++ b/cl/citations/annotate_citations.py
@@ -15,8 +15,14 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None:
 
     :param document: The Opinion or RECAPDocument whose text should be parsed
     """
-    for attr in ["html_anon_2020", "html_columbia", "html_lawbox", "html"]:
-        text = getattr(document, attr, None)
+    for attr in [
+        "xml_harvard",
+        "html_anon_2020",
+        "html_columbia",
+        "html_lawbox",
+        "html",
+    ]:
+        text = getattr(document, attr, None).encode("utf-8")
         if text:
             document.source_text = text
             document.cleaned_text = clean_text(

From a6319573701545a0df84cd2177f6cf1b2838bcf2 Mon Sep 17 00:00:00 2001
From: William Palin <bill@free.law>
Date: Thu, 26 Sep 2024 09:29:54 -0400
Subject: [PATCH 2/4] fix(annotate_citations): Fix lint

---
 cl/citations/annotate_citations.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py
index b94c52786f..ebf3ef65f2 100644
--- a/cl/citations/annotate_citations.py
+++ b/cl/citations/annotate_citations.py
@@ -22,9 +22,9 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None:
         "html_lawbox",
         "html",
     ]:
-        text = getattr(document, attr, None).encode("utf-8")
+        text = getattr(document, attr, None)
         if text:
-            document.source_text = text
+            document.source_text = text.encode("utf-8")
             document.cleaned_text = clean_text(
                 text, ["html", "all_whitespace"]
             )

From 2d27a5f4ad127a7b902eda22f8c79af02733fdea Mon Sep 17 00:00:00 2001
From: William Palin <bill@free.law>
Date: Thu, 26 Sep 2024 11:35:22 -0400
Subject: [PATCH 3/4] fix(annotate_citations): Strip xml encoding

Some Harvard XML starts with an XML declaration
carrying encoding info, which throws lxml for a loop.
Remove it and add a test to ensure proper parsing.
---
 cl/citations/annotate_citations.py |  5 +++-
 cl/citations/tests.py              | 42 ++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py
index ebf3ef65f2..89056f2b34 100644
--- a/cl/citations/annotate_citations.py
+++ b/cl/citations/annotate_citations.py
@@ -1,4 +1,5 @@
 import html
+import re
 from typing import Dict, List
 
 from eyecite import annotate_citations, clean_text
@@ -24,7 +25,9 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None:
     ]:
         text = getattr(document, attr, None)
         if text:
-            document.source_text = text.encode("utf-8")
+            document.source_text = text
+            # Remove XML encodings from xml_harvard
+            text = re.sub(r"^<\?xml.*?\?>", "", text, count=1)
             document.cleaned_text = clean_text(
                 text, ["html", "all_whitespace"]
             )
diff --git a/cl/citations/tests.py b/cl/citations/tests.py
index df8387d972..f047594268 100644
--- a/cl/citations/tests.py
+++ b/cl/citations/tests.py
@@ -276,6 +276,48 @@ def test_make_html_from_html(self) -> None:
                     msg=f"\n{created_html}\n\n    !=\n\n{expected_html}",
                 )
 
+    def test_make_html_from_harvard_xml(self) -> None:
+        """Can we convert the XML of an opinion into modified HTML?"""
+        # fmt: off
+
+        test_pairs = [
+            # Citation with XML encoding
+            ('<?xml version="1.0" encoding="utf-8"?><opinion type="majority">'
+             '<p id="b148-5"> <em> Swift &amp; Co. </em>v. '
+             '<em> United States,</em> 196 U. S. 375:</p></opinion>',
+             '<?xml version="1.0" encoding="utf-8"?><opinion type="majority">'
+             '<p id="b148-5"> <em> Swift &amp; Co. </em>v. '
+             '<em> United States,</em> '
+             '<span class="citation no-link">196 U. S. 375</span>:</p>'
+             '</opinion>'),
+        ]
+
+        # fmt: on
+        for s, expected_html in test_pairs:
+            with self.subTest(
+                f"Testing html to html conversion for {s}...",
+                s=s,
+                expected_html=expected_html,
+            ):
+                opinion = Opinion(html=s)
+                get_and_clean_opinion_text(opinion)
+                citations = get_citations(
+                    opinion.cleaned_text, tokenizer=HYPERSCAN_TOKENIZER
+                )
+
+                # Stub out fake output from do_resolve_citations(), since the
+                # purpose of this test is not to test that. We just need
+                # something that looks like what create_cited_html() expects
+                # to receive.
+                citation_resolutions = {NO_MATCH_RESOURCE: citations}
+
+                created_html = create_cited_html(opinion, citation_resolutions)
+                self.assertEqual(
+                    created_html,
+                    expected_html,
+                    msg=f"\n{created_html}\n\n    !=\n\n{expected_html}",
+                )
+
     def test_make_html_from_matched_citation_objects(self) -> None:
         """Can we render matched citation objects as HTML?"""
         # This test case is similar to the two above, except it allows us to

From b27a8bdb044ebc05b88fa5fd95f4d2d54134ba75 Mon Sep 17 00:00:00 2001
From: William Palin <bill@free.law>
Date: Thu, 12 Dec 2024 14:41:11 -0500
Subject: [PATCH 4/4] fix(annotate_citations): Tweaks to PR

Add comment on order
and tweak test file to reflect XML ingestion
---
 cl/citations/annotate_citations.py | 2 ++
 cl/citations/tests.py              | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py
index 89056f2b34..ea7d965f9e 100644
--- a/cl/citations/annotate_citations.py
+++ b/cl/citations/annotate_citations.py
@@ -16,6 +16,8 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None:
 
     :param document: The Opinion or RECAPDocument whose text should be parsed
     """
+
+    # We prefer CAP data (xml_harvard) first.
     for attr in [
         "xml_harvard",
         "html_anon_2020",
diff --git a/cl/citations/tests.py b/cl/citations/tests.py
index f4cb51d575..59b1e7a986 100644
--- a/cl/citations/tests.py
+++ b/cl/citations/tests.py
@@ -290,7 +290,7 @@ def test_make_html_from_harvard_xml(self) -> None:
                 s=s,
                 expected_html=expected_html,
             ):
-                opinion = Opinion(html=s)
+                opinion = Opinion(xml_harvard=s)
                 get_and_clean_opinion_text(opinion)
                 citations = get_citations(
                     opinion.cleaned_text, tokenizer=HYPERSCAN_TOKENIZER