diff --git a/cl/lib/utils.py b/cl/lib/utils.py index 047389acd6..5e785ed659 100644 --- a/cl/lib/utils.py +++ b/cl/lib/utils.py @@ -251,7 +251,10 @@ def cleanup_main_query(query_string: str) -> str: cleaned_items = [] # Replace smart quotes with standard double quotes for consistency. query_string = re.sub(r"[“”]", '"', query_string) - for item in re.split(r'([^a-zA-Z0-9_\-^~":]+)', query_string): + # Tweaks to the following regex for special characters exceptions + # like §, $, %, and ¶ should also be applied to type_table in + # custom_word_delimiter_filter. + for item in re.split(r'([^a-zA-Z0-9_\-^~":§$%¶]+)', query_string): if not item: continue diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 82784ce5ea..16615e3e03 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -422,13 +422,18 @@ def setUpTestData(cls): sub_opinions=RelatedFactory( OpinionWithChildrenFactory, factory_related_name="cluster", - html_columbia="
Code, § 1-815
", + html_columbia="Code, § 1-815 Lorem §247 $247 %247 ¶247
", ), precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, ) OpinionClusterFactoryWithChildrenAndParents( case_name="Strickland v. Lorem.", docket=DocketFactory(court=cls.court, docket_number="123456"), + sub_opinions=RelatedFactory( + OpinionWithChildrenFactory, + factory_related_name="cluster", + plain_text="Random plain_text", + ), precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, ) OpinionClusterFactoryWithChildrenAndParents( @@ -436,6 +441,11 @@ def setUpTestData(cls): docket=DocketFactory( court=cls.child_court_1, docket_number="34-2535" ), + sub_opinions=RelatedFactory( + OpinionWithChildrenFactory, + factory_related_name="cluster", + plain_text="Lorem 247", + ), precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, ) OpinionClusterFactoryWithChildrenAndParents( @@ -443,6 +453,11 @@ def setUpTestData(cls): docket=DocketFactory( court=cls.child_court_2_2, docket_number="36-2000" ), + sub_opinions=RelatedFactory( + OpinionWithChildrenFactory, + factory_related_name="cluster", + plain_text="Random plain_text", + ), precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, ) @@ -451,6 +466,11 @@ def setUpTestData(cls): docket=DocketFactory( court=cls.child_gand_2, docket_number="38-1000" ), + sub_opinions=RelatedFactory( + OpinionWithChildrenFactory, + factory_related_name="cluster", + plain_text="Random plain_text", + ), precedential_status=PRECEDENTIAL_STATUS.PUBLISHED, ) call_command( @@ -838,6 +858,59 @@ def test_raise_forbidden_error_on_depth_pagination(self) -> None: ) self.assertEqual(r.status_code, HTTPStatus.FORBIDDEN) + async def test_avoid_splitting_terms_on_special_chars(self) -> None: + """Can we avoid splitting words in queries such as §247 and phrases + like "§247"? + """ + + special_chars_exceptions = ["§", "$", "%", "¶"] + # A search for phrase "§247" shouldn't match "247" + for special_char in special_chars_exceptions: + with self.subTest( + special_char=special_char, msg="Phrase query and special char." + ): + r = await self.async_client.get( + reverse("show_results"), {"q": f'"{special_char}247"'} + ) + actual = self.get_article_count(r) + self.assertEqual( + actual, 1, msg="Didn't get the right number of results" + ) + self.assertIn("1:21-cv-1234", r.content.decode()) + + # A search for phrase "247" shouldn't match "§247" + r = await self.async_client.get( + reverse("show_results"), {"q": '"247"'} + ) + actual = self.get_article_count(r) + self.assertEqual( + actual, 1, msg="Didn't get the right number of results" + ) + self.assertIn("34-2535", r.content.decode()) + + # A search for §247 shouldn't match 247 + for special_char in special_chars_exceptions: + with self.subTest( + special_char=special_char, + msg="Non-phrase query and special char.", + ): + r = await self.async_client.get( + reverse("show_results"), {"q": f"{special_char}247"} + ) + actual = self.get_article_count(r) + self.assertEqual( + actual, 1, msg="Didn't get the right number of results" + ) + self.assertIn("1:21-cv-1234", r.content.decode()) + + # A search for 247 shouldn't match §247 + r = await self.async_client.get(reverse("show_results"), {"q": "247"}) + actual = self.get_article_count(r) + self.assertEqual( + actual, 1, msg="Didn't get the right number of results" + ) + self.assertIn("34-2535", r.content.decode()) + class SearchAPIV4CommonTest(ESIndexTestCase, TestCase): """Common tests for the Search API V4 endpoints.""" @@ -991,6 +1064,10 @@ def test_query_cleanup_function(self) -> None: '"this is a test" ~2 and "net neutrality" ~5 and 22cv3332', '"this is a test"~2 and "net neutrality"~5 and docketNumber:"22-cv-3332"~1', ), + ("§242", "§242"), + ("$242", "$242"), + ("%242", "%242"), + ("¶242", "¶242"), ) for q, a in q_a: print("Does {q} --> {a} ? ".format(**{"q": q, "a": a})) diff --git a/cl/settings/third_party/elasticsearch.py b/cl/settings/third_party/elasticsearch.py index 7a1ec6b779..ee0cef0fc3 100644 --- a/cl/settings/third_party/elasticsearch.py +++ b/cl/settings/third_party/elasticsearch.py @@ -125,6 +125,12 @@ "filter": { "custom_word_delimiter_filter": { "type": "word_delimiter", + "type_table": [ + "§ => ALPHANUM", + "$ => ALPHANUM", + "% => ALPHANUM", + "¶ => ALPHANUM", + ], "split_on_numerics": False, "preserve_original": True, },