Skip to content

Commit

Permalink
Create skip_cleaning and prefixes Name arguments
Browse files Browse the repository at this point in the history
Both ideas are discussed in #58.

- skip_cleaning completely skips multiple stages of cleaning names
and name parts.
- prefixes allows the user to modify the list of prefixes, exposed
from the default config.PREFIXES

This currently only affects the Name class and is not part of the
CLI or a full release while I consider:

1. How to implement a general way to override config lists such as
suffixes, conjunctions, titles, etc. It's currently possible only
to monkeypatch these, but since we can now adjust prefixes we
should extend this to other constants.
2. Whether and how to incorporate any of these changes into the
CLI. I'm leaning toward no, but perhaps --skip-cleaning is fine.
  • Loading branch information
vaneseltine committed Jul 30, 2024
1 parent f0879e3 commit f56c303
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 41 deletions.
69 changes: 46 additions & 23 deletions nominally/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def wrapper_bouncer(obj: T.Any, countable: WordContainer) -> WordContainer:
checklist = countable
else:
checklist = flatten_once(countable)
wordlist = [s for s in checklist if re.search("[a-z]", str(s))]
wordlist = [s for s in checklist if re.search(r"\S", str(s))]
if len(wordlist) < minimum:
return countable
return func(obj, countable)
Expand All @@ -57,21 +57,44 @@ class Name(MappingBase):

_keys = ["title", "first", "middle", "last", "suffix", "nickname"]
# https://github.com/vaneseltine/nominally/issues/47
__slots__ = _keys + ["_raw", "_has_generational", "detail", "_final", "_cleaned"]

def __init__(self, raw: str = "") -> None:

__slots__ = _keys + [
"_raw",
"_skip_cleaning",
"prefixes",
"_has_generational",
"detail",
"_final",
"_cleaned",
]

def __init__(
self,
raw: str = "",
*,
skip_cleaning: bool = False,
prefixes: T.Optional[T.Collection[str]] = None,
) -> None:
self._raw = raw
self._skip_cleaning = skip_cleaning
self.prefixes: T.Collection[str]
if prefixes is None:
self.prefixes = config.PREFIXES
else:
self.prefixes = prefixes

self._has_generational = False
self.detail: T.Dict[str, Cluster] = {k: [] for k in self._keys}

s = self._pre_clean(self.raw)
s = self._pre_process(s)
s = self.clean(s)
if skip_cleaning:
s = self._raw
else:
s = self._pre_clean(self.raw)
s = self._pre_process(s)
s = self.clean(s)
self._cleaned = self._archive_cleaned(s)
self._process(s)
self._post_process()
self._final = self._post_clean()
self._final = self._post_clean(skip_cleaning=self._skip_cleaning)

@staticmethod
def _pre_clean(s: str) -> str:
Expand Down Expand Up @@ -110,16 +133,16 @@ def clean(cls, s: str, *, condense: bool = False, final: bool = False) -> str:
(r"(\s*(;|:|,))+", ", "), # convert : ; , to , with spacing
(r"\.\s*", ". "), # reduce/add space after each .
(r"[-_/\\:]+", "-"), # convert _ / \ - : to single hyphen
(r"[^-\sa-z0-9,]+", ""), # drop most all excluding - , .
(r"[^-\sA-Za-z0-9,]+", ""),  # drop everything except - and , (and whitespace/alphanumerics)
(r"\s+", whitespace_out), # condense all whitespace groups
]
if final:
cleaning_subs.append((r"[^a-z0-9- \)\()]", ""))
cleaning_subs.append((r"[^A-Za-z0-9- \)\()]", ""))
for pattern, repl in cleaning_subs:
s = re.sub(pattern, repl, s)
s = cls.strip_pointlessness(s)

if not re.search(r"[a-z]", s):
if not re.search(r"[A-Za-z]", s):
return ""
return s

Expand All @@ -137,7 +160,8 @@ def _process(self, preprocessed_str: str) -> None:
"""Primary processing of clusters into extracted name parts."""
clusters = self._string_to_clusters(preprocessed_str)
clusters = self._extract_title(clusters)
clusters = self._remove_numbers(clusters)
if not self._skip_cleaning:
clusters = self._remove_numbers(clusters)
clusters = self._grab_junior(clusters)
self._extract_last_first_middle(clusters)

Expand Down Expand Up @@ -224,7 +248,9 @@ def _deep_number_clean(cls, s: str) -> str:
no_numbers = re.sub(r"\d", "", s)
return cls.strip_pointlessness(no_numbers)

def _post_clean(self) -> T.Dict[str, Cluster]:
def _post_clean(self, skip_cleaning: bool = False) -> T.Dict[str, Cluster]:
if skip_cleaning:
return {k: self.detail[k] for k in self._keys}
return {k: self._clean_cluster(self.detail[k]) for k in self._keys}

@classmethod
Expand Down Expand Up @@ -307,20 +333,18 @@ def _flip_last_name_to_right(cls, clusters: Clusters) -> Clusters:
clusters[-1].append(partitioned_last)
return clusters

@classmethod
def _cluster_words(cls, cluster: Cluster) -> Cluster:
def _cluster_words(self, cluster: Cluster) -> Cluster:
"""
Split list of cluster down to individual words and
- join on conjunctions if appropriate
- add prefixes to last names if appropriate
"""
cluster = cls._combine_conjunctions(cluster)
cluster = cls._combine_rightmost_prefixes(cluster)
cluster = self._combine_conjunctions(cluster)
cluster = self._combine_rightmost_prefixes(cluster)
return cluster

@classmethod
@word_count_bouncer(minimum=4)
def _combine_conjunctions(cls, cluster: Cluster) -> Cluster:
def _combine_conjunctions(self, cluster: Cluster) -> Cluster:
"""Accept one conjunction at the end: `bob|steve|cortez y costa`"""
*new_cluster, last_name_one, conj, last_name_two = cluster

Expand All @@ -331,14 +355,13 @@ def _combine_conjunctions(cls, cluster: Cluster) -> Cluster:
new_cluster.append(rightmost)
return new_cluster

@classmethod
@word_count_bouncer(minimum=3)
def _combine_rightmost_prefixes(cls, cluster: Cluster) -> Cluster:
def _combine_rightmost_prefixes(self, cluster: Cluster) -> Cluster:
"""Work right-to-left through cluster, joining up prefixes of rightmost"""
result: Clusters = []

for word in reversed(cluster):
if len(result) > 1 or word not in config.PREFIXES:
if len(result) > 1 or word not in self.prefixes:
result.insert(0, [word])
continue
if not result:
Expand Down
5 changes: 0 additions & 5 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,6 @@ def get_pypi_version(encoding="utf-8"):
return pypi_json["info"]["version"]


@nox.session(python=False)
def lint_flake8(session):
session.run("flake8", ".")


@nox.session(python=False)
def lint_pylint(session):
for args in [PACKAGE_NAME, "test --rcfile=./test/pylintrc"]:
Expand Down
27 changes: 24 additions & 3 deletions test/conftest.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,36 @@
import json
import typing as T
from pathlib import Path

import pytest

TEST_DATA_DIRECTORY = Path(__file__).parent / "names"


def dict_entry_test(testclass, entry):
observed = dict(testclass(entry["raw"]))
def dict_entry_test(testclass: T.Callable, entry: T.Mapping[str, str], **kwargs):
"""
Simplified test interface for asserting a dictionary containing raw input and
expected results.
Arguments:
- testclass is the class under test (typically Name)
- entry is a dict containing a "raw" key and at least one other key to retrieve.
- **kwargs are passed on to testclass
Example:
dict_entry_test(
Name,
{"raw": "vimes, samuel x", "first": "samuel", "middle": "x", "last": "vimes"},
)
This ensures that dict(Class(raw)) produces the output name parts:
- dict(Class(raw))["first"] == "samuel"
- dict(Class(raw))["middle"] == "x"
- dict(Class(raw))["last"] == "vimes"
"""
observed = dict(testclass(entry["raw"], **kwargs))
expected = {key: entry.get(key, "") for key in observed.keys()}
expected["suffix"] = set(expected["suffix"].split())
expected["suffix"] = set(expected["suffix"].split()) # type: ignore
observed["suffix"] = set(observed["suffix"].split())
assert observed == expected

Expand Down
73 changes: 71 additions & 2 deletions test/test__issues.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,76 @@
"""
import pytest

from nominally import Name
from nominally.config import PREFIXES

from .conftest import dict_entry_test
"""


def issue_58_respect_basic_case():
dict_entry_test(
Name,
{
"raw": "Herve Corre",
"first": "Herve",
"last": "Corre",
},
skip_cleaning=True,
)


def issue_58_allow_skipping_cleaning_to_preserve_accents():
name = Name("Hervé Corre", skip_cleaning=True)
assert name.first == "Hervé"
assert name.last == "Corre"


@pytest.mark.parametrize(
"raw, first, middle, last",
[
("Antonia Ax:son Johnson", "Antonia", "Ax:son", "Johnson"),
("Elliot S! Maggin", "Elliot", "S!", "Maggin"),
],
)
def issue_58_allow_skipping_cleaning_to_preserve_punctuation(raw, first, middle, last):
name = Name(raw, skip_cleaning=True)
assert name.first == first
assert name.middle == middle
assert name.last == last


def issue_58_allow_skipping_cleaning_to_preserve_numbers():
name = Name("2 Chainz", skip_cleaning=True)
assert name.first == "2"
assert name.last == "Chainz"


def issue_58_allow_skipping_cleaning_to_preserve_capitals():
name = Name("Herve Le Corre", skip_cleaning=True)
assert name.first == "Herve"
assert name.middle == "Le"
assert name.last == "Corre"


def issue_58_nofix_two_name_raw_despite_prefix():
"""
This behavior is intended; an existing test uses "van nguyen" to capture the first
name "van."
"""
name = Name("de patris", prefixes={"de"})
assert name.first == "de"
assert name.last == "patris"


@pytest.mark.parametrize(
"prefix_arg, first, middle, last",
[
(None, "herve", "pfx", "corre"), # default argument goes to config.PREFIXES
({"pfx"} | PREFIXES, "herve", "", "pfx corre"),  # Now "pfx" is part of the prefixes
({"pfx"}, "herve", "", "pfx corre"),  # Now "pfx" is the ONLY prefix
],
)
def issue_58_api_for_custom_prefixes(prefix_arg, first, middle, last):
name = Name("herve pfx corre", prefixes=prefix_arg)
assert name.first == first
assert name.middle == middle
assert name.last == last
19 changes: 11 additions & 8 deletions test/test_prefix_conjunction.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest

from nominally.parser import Name

from .conftest import dict_entry_test


Expand All @@ -9,15 +10,17 @@ def only_weird_prefixes():


@pytest.mark.parametrize(
"incoming, outgoing",
"raw, first, last",
(
(["oni", "de", "la", "soul"], ["oni", "de la soul"]),
(["oni", "bin", "baloney"], ["oni", "bin baloney"]),
(["oni", "van", "mooface"], ["oni", "van mooface"]),
("oni de la soul", "oni", "de la soul"),
("oni bin baloney", "oni", "bin baloney"),
("oni van mooface", "oni", "van mooface"),
),
)
def test_prefix_combining(incoming, outgoing):
assert Name._combine_rightmost_prefixes(incoming) == outgoing
def test_prefix_combining(raw, first, last):
name = Name(raw)
assert name.first == first
assert name.last == last


@pytest.mark.parametrize(
Expand All @@ -43,7 +46,7 @@ def test_prefix_avoid(static):
),
)
def test_conjunction_combine(incoming, outgoing):
assert Name._combine_conjunctions(incoming) == outgoing
assert Name("")._combine_conjunctions(incoming) == outgoing


@pytest.mark.parametrize(
Expand All @@ -55,7 +58,7 @@ def test_conjunction_combine(incoming, outgoing):
),
)
def test_conjunction_avoid(static):
assert Name._combine_conjunctions(static) == static
assert Name("")._combine_conjunctions(static) == static


@pytest.mark.parametrize(
Expand Down

0 comments on commit f56c303

Please sign in to comment.