From 3c649dd12f5b70a8640cc76099f2256dc318f622 Mon Sep 17 00:00:00 2001 From: William Palin Date: Wed, 25 Sep 2024 19:16:54 -0400 Subject: [PATCH 1/4] feat(annotate_citations): Update citation parsing Add XML harvard to get_and_clean_opinion_text so that we can find and add annotations in the harvard corpus --- cl/citations/annotate_citations.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py index f4184d64dd..b94c52786f 100644 --- a/cl/citations/annotate_citations.py +++ b/cl/citations/annotate_citations.py @@ -15,8 +15,14 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None: :param document: The Opinion or RECAPDocument whose text should be parsed """ - for attr in ["html_anon_2020", "html_columbia", "html_lawbox", "html"]: - text = getattr(document, attr, None) + for attr in [ + "xml_harvard", + "html_anon_2020", + "html_columbia", + "html_lawbox", + "html", + ]: + text = getattr(document, attr, None).encode("utf-8") if text: document.source_text = text document.cleaned_text = clean_text( From a6319573701545a0df84cd2177f6cf1b2838bcf2 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 26 Sep 2024 09:29:54 -0400 Subject: [PATCH 2/4] fix(annotate_citations): Fix lint --- cl/citations/annotate_citations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py index b94c52786f..ebf3ef65f2 100644 --- a/cl/citations/annotate_citations.py +++ b/cl/citations/annotate_citations.py @@ -22,9 +22,9 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None: "html_lawbox", "html", ]: - text = getattr(document, attr, None).encode("utf-8") + text = getattr(document, attr, None) if text: - document.source_text = text + document.source_text = text.encode("utf-8") document.cleaned_text = clean_text( text, ["html", "all_whitespace"] ) From 
2d27a5f4ad127a7b902eda22f8c79af02733fdea Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 26 Sep 2024 11:35:22 -0400 Subject: [PATCH 3/4] fix(annotate_citations): Strip xml encoding Some harvard xml has encoding info which throws lxml for a loop. Remove it and add a test to ensure proper parsing. --- cl/citations/annotate_citations.py | 5 +++- cl/citations/tests.py | 42 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py index ebf3ef65f2..89056f2b34 100644 --- a/cl/citations/annotate_citations.py +++ b/cl/citations/annotate_citations.py @@ -1,4 +1,5 @@ import html +import re from typing import Dict, List from eyecite import annotate_citations, clean_text @@ -24,7 +25,9 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None: ]: text = getattr(document, attr, None) if text: - document.source_text = text.encode("utf-8") + document.source_text = text + # Remove XML encodings from xml_harvard + text = re.sub(r"^<\?xml.*?\?>", "", text, count=1) document.cleaned_text = clean_text( text, ["html", "all_whitespace"] ) diff --git a/cl/citations/tests.py b/cl/citations/tests.py index df8387d972..f047594268 100644 --- a/cl/citations/tests.py +++ b/cl/citations/tests.py @@ -276,6 +276,48 @@ def test_make_html_from_html(self) -> None: msg=f"\n{created_html}\n\n !=\n\n{expected_html}", ) + def test_make_html_from_harvard_xml(self) -> None: + """Can we convert the XML of an opinion into modified HTML?""" + # fmt: off + + test_pairs = [ + # Citation with XML encoding + ('' + '

Swift & Co. v. ' + ' United States, 196 U. S. 375:

', + '' + '

Swift & Co. v. ' + ' United States, ' + '196 U. S. 375:

' + '
'), + ] + + # fmt: on + for s, expected_html in test_pairs: + with self.subTest( + f"Testing html to html conversion for {s}...", + s=s, + expected_html=expected_html, + ): + opinion = Opinion(html=s) + get_and_clean_opinion_text(opinion) + citations = get_citations( + opinion.cleaned_text, tokenizer=HYPERSCAN_TOKENIZER + ) + + # Stub out fake output from do_resolve_citations(), since the + # purpose of this test is not to test that. We just need + # something that looks like what create_cited_html() expects + # to receive. + citation_resolutions = {NO_MATCH_RESOURCE: citations} + + created_html = create_cited_html(opinion, citation_resolutions) + self.assertEqual( + created_html, + expected_html, + msg=f"\n{created_html}\n\n !=\n\n{expected_html}", + ) + def test_make_html_from_matched_citation_objects(self) -> None: """Can we render matched citation objects as HTML?""" # This test case is similar to the two above, except it allows us to From b27a8bdb044ebc05b88fa5fd95f4d2d54134ba75 Mon Sep 17 00:00:00 2001 From: William Palin Date: Thu, 12 Dec 2024 14:41:11 -0500 Subject: [PATCH 4/4] fix(annotate_citations): Tweaks to PR Add comment on order and tweak test file to reflect xml ingestion --- cl/citations/annotate_citations.py | 2 ++ cl/citations/tests.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cl/citations/annotate_citations.py b/cl/citations/annotate_citations.py index 89056f2b34..ea7d965f9e 100644 --- a/cl/citations/annotate_citations.py +++ b/cl/citations/annotate_citations.py @@ -16,6 +16,8 @@ def get_and_clean_opinion_text(document: Opinion | RECAPDocument) -> None: :param document: The Opinion or RECAPDocument whose text should be parsed """ + + # We prefer CAP data (xml_harvard) first. 
for attr in [ "xml_harvard", "html_anon_2020", diff --git a/cl/citations/tests.py b/cl/citations/tests.py index f4cb51d575..59b1e7a986 100644 --- a/cl/citations/tests.py +++ b/cl/citations/tests.py @@ -290,7 +290,7 @@ def test_make_html_from_harvard_xml(self) -> None: s=s, expected_html=expected_html, ): - opinion = Opinion(html=s) + opinion = Opinion(xml_harvard=s) get_and_clean_opinion_text(opinion) citations = get_citations( opinion.cleaned_text, tokenizer=HYPERSCAN_TOKENIZER