Skip to content

Commit

Permalink
Create skip_cleaning and prefixes Name arguments
Browse files Browse the repository at this point in the history
Both ideas are discussed in #58.

- skip_cleaning completely skips multiple stages of cleaning names
and name parts.
- prefixes allows the user to modify the list of prefixes, exposed
from the default config.PREFIXES

This currently only affects the Name class and is not part of the
CLI or a full release while I consider:

1. How to implement a general way to override config lists such as
suffixes, conjunctions, titles, etc. It's currently possible only
to monkeypatch these, but since we can now adjust prefixes we
should extend this to other constants.
2. Whether and how to incorporate any of these changes into the
CLI. I'm leaning toward no, but perhaps --skip-cleaning is fine.
  • Loading branch information
vaneseltine committed Jul 30, 2024
1 parent f0879e3 commit f56c303
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 41 deletions.
69 changes: 46 additions & 23 deletions nominally/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def wrapper_bouncer(obj: T.Any, countable: WordContainer) -> WordContainer:
checklist = countable
else:
checklist = flatten_once(countable)
wordlist = [s for s in checklist if re.search("[a-z]", str(s))]
wordlist = [s for s in checklist if re.search(r"\S", str(s))]
if len(wordlist) < minimum:
return countable
return func(obj, countable)
Expand All @@ -57,21 +57,44 @@ class Name(MappingBase):

_keys = ["title", "first", "middle", "last", "suffix", "nickname"]
# https://github.com/vaneseltine/nominally/issues/47
__slots__ = _keys + ["_raw", "_has_generational", "detail", "_final", "_cleaned"]

def __init__(self, raw: str = "") -> None:

__slots__ = _keys + [
"_raw",
"_skip_cleaning",
"prefixes",
"_has_generational",
"detail",
"_final",
"_cleaned",
]

def __init__(
self,
raw: str = "",
*,
skip_cleaning: bool = False,
prefixes: T.Optional[T.Collection[str]] = None,
) -> None:
self._raw = raw
self._skip_cleaning = skip_cleaning
self.prefixes: T.Collection[str]
if prefixes is None:
self.prefixes = config.PREFIXES
else:
self.prefixes = prefixes

self._has_generational = False
self.detail: T.Dict[str, Cluster] = {k: [] for k in self._keys}

s = self._pre_clean(self.raw)
s = self._pre_process(s)
s = self.clean(s)
if skip_cleaning:
s = self._raw
else:
s = self._pre_clean(self.raw)
s = self._pre_process(s)
s = self.clean(s)
self._cleaned = self._archive_cleaned(s)
self._process(s)
self._post_process()
self._final = self._post_clean()
self._final = self._post_clean(skip_cleaning=self._skip_cleaning)

@staticmethod
def _pre_clean(s: str) -> str:
Expand Down Expand Up @@ -110,16 +133,16 @@ def clean(cls, s: str, *, condense: bool = False, final: bool = False) -> str:
(r"(\s*(;|:|,))+", ", "), # convert : ; , to , with spacing
(r"\.\s*", ". "), # reduce/add space after each .
(r"[-_/\\:]+", "-"), # convert _ / \ - : to single hyphen
(r"[^-\sa-z0-9,]+", ""), # drop most all excluding - , .
(r"[^-\sA-Za-z0-9,]+", ""),  # drop everything except - and , (and whitespace/alphanumerics)
(r"\s+", whitespace_out), # condense all whitespace groups
]
if final:
cleaning_subs.append((r"[^a-z0-9- \)\()]", ""))
cleaning_subs.append((r"[^A-Za-z0-9- \)\()]", ""))
for pattern, repl in cleaning_subs:
s = re.sub(pattern, repl, s)
s = cls.strip_pointlessness(s)

if not re.search(r"[a-z]", s):
if not re.search(r"[A-Za-z]", s):
return ""
return s

Expand All @@ -137,7 +160,8 @@ def _process(self, preprocessed_str: str) -> None:
"""Primary processing of clusters into extracted name parts."""
clusters = self._string_to_clusters(preprocessed_str)
clusters = self._extract_title(clusters)
clusters = self._remove_numbers(clusters)
if not self._skip_cleaning:
clusters = self._remove_numbers(clusters)
clusters = self._grab_junior(clusters)
self._extract_last_first_middle(clusters)

Expand Down Expand Up @@ -224,7 +248,9 @@ def _deep_number_clean(cls, s: str) -> str:
no_numbers = re.sub(r"\d", "", s)
return cls.strip_pointlessness(no_numbers)

def _post_clean(self) -> T.Dict[str, Cluster]:
def _post_clean(self, skip_cleaning: bool = False) -> T.Dict[str, Cluster]:
if skip_cleaning:
return {k: self.detail[k] for k in self._keys}
return {k: self._clean_cluster(self.detail[k]) for k in self._keys}

@classmethod
Expand Down Expand Up @@ -307,20 +333,18 @@ def _flip_last_name_to_right(cls, clusters: Clusters) -> Clusters:
clusters[-1].append(partitioned_last)
return clusters

@classmethod
def _cluster_words(cls, cluster: Cluster) -> Cluster:
def _cluster_words(self, cluster: Cluster) -> Cluster:
"""
Split list of cluster down to individual words and
- join on conjunctions if appropriate
- add prefixes to last names if appropriate
"""
cluster = cls._combine_conjunctions(cluster)
cluster = cls._combine_rightmost_prefixes(cluster)
cluster = self._combine_conjunctions(cluster)
cluster = self._combine_rightmost_prefixes(cluster)
return cluster

@classmethod
@word_count_bouncer(minimum=4)
def _combine_conjunctions(cls, cluster: Cluster) -> Cluster:
def _combine_conjunctions(self, cluster: Cluster) -> Cluster:
"""Accept one conjunction at the end: `bob|steve|cortez y costa`"""
*new_cluster, last_name_one, conj, last_name_two = cluster

Expand All @@ -331,14 +355,13 @@ def _combine_conjunctions(cls, cluster: Cluster) -> Cluster:
new_cluster.append(rightmost)
return new_cluster

@classmethod
@word_count_bouncer(minimum=3)
def _combine_rightmost_prefixes(cls, cluster: Cluster) -> Cluster:
def _combine_rightmost_prefixes(self, cluster: Cluster) -> Cluster:
"""Work right-to-left through cluster, joining up prefixes of rightmost"""
result: Clusters = []

for word in reversed(cluster):
if len(result) > 1 or word not in config.PREFIXES:
if len(result) > 1 or word not in self.prefixes:
result.insert(0, [word])
continue
if not result:
Expand Down
5 changes: 0 additions & 5 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,6 @@ def get_pypi_version(encoding="utf-8"):
return pypi_json["info"]["version"]


@nox.session(python=False)
def lint_flake8(session):
session.run("flake8", ".")


@nox.session(python=False)
def lint_pylint(session):
for args in [PACKAGE_NAME, "test --rcfile=./test/pylintrc"]:
Expand Down
27 changes: 24 additions & 3 deletions test/conftest.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,36 @@
import json
import typing as T
from pathlib import Path

import pytest

TEST_DATA_DIRECTORY = Path(__file__).parent / "names"


def dict_entry_test(testclass, entry):
observed = dict(testclass(entry["raw"]))
def dict_entry_test(testclass: T.Callable, entry: T.Mapping[str, str], **kwargs):
"""
Simplified test interface for asserting a dictionary containing raw input and
expected results.
Arguments:
- testclass is the class under test (typically Name)
- entry is a dict containing a "raw" key and at least one other key to retrieve.
- **kwargs are passed on to testclass
Example:
dict_entry_test(
Name,
{"raw": "vimes, samuel x", "first": "samuel", "middle": "x", "last": "vimes"},
)
This ensures that dict(Class(raw)) produces the output name parts:
- dict(Class(raw))["first"] == "samuel"
- dict(Class(raw))["middle"] == "x"
- dict(Class(raw))["last"] == "vimes"
"""
observed = dict(testclass(entry["raw"], **kwargs))
expected = {key: entry.get(key, "") for key in observed.keys()}
expected["suffix"] = set(expected["suffix"].split())
expected["suffix"] = set(expected["suffix"].split()) # type: ignore
observed["suffix"] = set(observed["suffix"].split())
assert observed == expected

Expand Down
73 changes: 71 additions & 2 deletions test/test__issues.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,76 @@
"""
import pytest

from nominally import Name
from nominally.config import PREFIXES

from .conftest import dict_entry_test
"""


def issue_58_respect_basic_case():
dict_entry_test(
Name,
{
"raw": "Herve Corre",
"first": "Herve",
"last": "Corre",
},
skip_cleaning=True,
)


def issue_58_allow_skipping_cleaning_to_preserve_accents():
name = Name("Hervé Corre", skip_cleaning=True)
assert name.first == "Hervé"
assert name.last == "Corre"


@pytest.mark.parametrize(
"raw, first, middle, last",
[
("Antonia Ax:son Johnson", "Antonia", "Ax:son", "Johnson"),
("Elliot S! Maggin", "Elliot", "S!", "Maggin"),
],
)
def issue_58_allow_skipping_cleaning_to_preserve_punctuation(raw, first, middle, last):
name = Name(raw, skip_cleaning=True)
assert name.first == first
assert name.middle == middle
assert name.last == last


def issue_58_allow_skipping_cleaning_to_preserve_numbers():
name = Name("2 Chainz", skip_cleaning=True)
assert name.first == "2"
assert name.last == "Chainz"


def issue_58_allow_skipping_cleaning_to_preserve_capitals():
name = Name("Herve Le Corre", skip_cleaning=True)
assert name.first == "Herve"
assert name.middle == "Le"
assert name.last == "Corre"


def issue_58_nofix_two_name_raw_despite_prefix():
"""
This behavior is intended; an existing test uses "van nguyen" to capture the first
name "van."
"""
name = Name("de patris", prefixes={"de"})
assert name.first == "de"
assert name.last == "patris"


@pytest.mark.parametrize(
"prefix_arg, first, middle, last",
[
(None, "herve", "pfx", "corre"), # default argument goes to config.PREFIXES
({"pfx"} | PREFIXES, "herve", "", "pfx corre"),  # Now "pfx" is part of the prefixes
({"pfx"}, "herve", "", "pfx corre"),  # Now "pfx" is the ONLY prefix
],
)
def issue_58_api_for_custom_prefixes(prefix_arg, first, middle, last):
name = Name("herve pfx corre", prefixes=prefix_arg)
assert name.first == first
assert name.middle == middle
assert name.last == last
19 changes: 11 additions & 8 deletions test/test_prefix_conjunction.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from nominally.parser import Name

from .conftest import dict_entry_test


Expand All @@ -9,15 +10,17 @@ def only_weird_prefixes():


@pytest.mark.parametrize(
"incoming, outgoing",
"raw, first, last",
(
(["oni", "de", "la", "soul"], ["oni", "de la soul"]),
(["oni", "bin", "baloney"], ["oni", "bin baloney"]),
(["oni", "van", "mooface"], ["oni", "van mooface"]),
("oni de la soul", "oni", "de la soul"),
("oni bin baloney", "oni", "bin baloney"),
("oni van mooface", "oni", "van mooface"),
),
)
def test_prefix_combining(incoming, outgoing):
assert Name._combine_rightmost_prefixes(incoming) == outgoing
def test_prefix_combining(raw, first, last):
name = Name(raw)
assert name.first == first
assert name.last == last


@pytest.mark.parametrize(
Expand All @@ -43,7 +46,7 @@ def test_prefix_avoid(static):
),
)
def test_conjunction_combine(incoming, outgoing):
assert Name._combine_conjunctions(incoming) == outgoing
assert Name("")._combine_conjunctions(incoming) == outgoing


@pytest.mark.parametrize(
Expand All @@ -55,7 +58,7 @@ def test_conjunction_combine(incoming, outgoing):
),
)
def test_conjunction_avoid(static):
assert Name._combine_conjunctions(static) == static
assert Name("")._combine_conjunctions(static) == static


@pytest.mark.parametrize(
Expand Down

0 comments on commit f56c303

Please sign in to comment.