From 962af9c80b856d4c32eefad7589c5ca12e2a4b81 Mon Sep 17 00:00:00 2001 From: Mike Date: Wed, 26 Jun 2024 13:50:40 +1200 Subject: [PATCH] Revert to pyphonetics (#46) --- pyproject.toml | 2 +- requirements.txt | 16 +++--- src/hdx/location/adminlevel.py | 20 +++---- src/hdx/location/names.py | 4 +- src/hdx/location/phonetics.py | 20 +++---- tests/fixtures/adminlevel.yaml | 1 - tests/fixtures/adminlevelparent.yaml | 7 ++- tests/hdx/location/test_adminlevel.py | 77 ++++++++++----------------- 8 files changed, 61 insertions(+), 86 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8255e81..007ca74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ requires-python = ">=3.8" dependencies = [ "hdx-python-utilities>=3.7.0", "libhxl>=5.2.1", - "rapidfuzz", + "pyphonetics", ] dynamic = ["version"] diff --git a/requirements.txt b/requirements.txt index 3b8d43d..25c448a 100755 --- a/requirements.txt +++ b/requirements.txt @@ -18,13 +18,13 @@ charset-normalizer==3.3.2 # via requests click==8.1.7 # via typer -coverage==7.5.3 +coverage==7.5.4 # via pytest-cov distlib==0.3.8 # via virtualenv et-xmlfile==1.1.0 # via openpyxl -filelock==3.15.3 +filelock==3.15.4 # via virtualenv frictionless==5.17.0 # via hdx-python-utilities @@ -60,7 +60,7 @@ loguru==0.7.2 # via hdx-python-utilities markdown-it-py==3.0.0 # via rich -marko==2.1.1 +marko==2.1.2 # via frictionless markupsafe==2.1.5 # via jinja2 @@ -90,6 +90,8 @@ pydantic-core==2.18.4 # via pydantic pygments==2.18.0 # via rich +pyphonetics==0.5.3 + # via hdx-python-country (pyproject.toml) pytest==8.2.2 # via # hdx-python-country (pyproject.toml) @@ -110,8 +112,6 @@ pyyaml==6.0.1 # frictionless # pre-commit # tableschema-to-template -rapidfuzz==3.9.3 - # via hdx-python-country (pyproject.toml) ratelimit==2.2.1 # via hdx-python-utilities referencing==0.35.1 @@ -164,14 +164,16 @@ typing-extensions==4.12.2 # pydantic-core # typer unidecode==1.3.8 - # via libhxl + # via + # libhxl + # pyphonetics urllib3==2.2.2 # via # libhxl # requests validators==0.28.3 # via frictionless -virtualenv==20.26.2 +virtualenv==20.26.3 # via pre-commit wheel==0.43.0 # via libhxl diff --git a/src/hdx/location/adminlevel.py b/src/hdx/location/adminlevel.py index 46ee760..30a54ab 100755 --- a/src/hdx/location/adminlevel.py +++ b/src/hdx/location/adminlevel.py @@ -5,6 +5,7 @@ import hxl from hxl import InputOptions from hxl.input import HXLIOException +from unidecode import unidecode from hdx.location.country import Country from hdx.location.names import clean_name @@ -146,7 +147,7 @@ def setup_row( self.pcode_to_name[pcode] = adm_name name_to_pcode = self.name_to_pcode.get(countryiso3, {}) - name_to_pcode[clean_name(adm_name)] = pcode + name_to_pcode[unidecode(adm_name).lower()] = pcode self.name_to_pcode[countryiso3] = name_to_pcode self.pcode_to_iso3[pcode] = countryiso3 self.pcode_to_iso3[pcode] = countryiso3 @@ -156,7 +157,7 @@ def setup_row( countryiso3, {} ) name_to_pcode = name_parent_to_pcode.get(parent, {}) - name_to_pcode[clean_name(adm_name)] = pcode + name_to_pcode[unidecode(adm_name).lower()] = pcode name_parent_to_pcode[parent] = name_to_pcode self.name_parent_to_pcode[countryiso3] = name_parent_to_pcode self.pcode_to_parent[pcode] = parent @@ -641,25 +642,22 @@ def fuzzy_pcode( break if not pcode: map_names = list(name_to_pcode.keys()) + lower_mapnames = [x.lower() for x in map_names] def al_transform_1(name): - prefix = name[:3] - if prefix == "al ": + if name[:3] == "al ": return f"ad {name[3:]}" - elif prefix == "ad ": - return f"al {name[3:]}" else: return None def al_transform_2(name): - prefix = name[:3] - if prefix == "al " or prefix == "ad ": + if name[:3] == "al ": return name[3:] else: return None matching_index = self.phonetics.match( - map_names, + lower_mapnames, adm_name_lookup, alternative_name=adm_name_lookup2, transform_possible_names=[al_transform_1, al_transform_2], @@ -716,7 +714,6 @@ def get_pcode( countryiso3: str, name: str, fuzzy_match: bool = True, - fuzzy_length: int = 4, **kwargs: Any, ) -> Tuple[Optional[str], bool]: """Get pcode for a given name @@ -725,7 +722,6 @@ def get_pcode( countryiso3 (str): ISO3 country code name (str): Name to match fuzzy_match (bool): Whether to try fuzzy matching. Defaults to True. - fuzzy_length (int): Minimum length for fuzzy matching. Defaults to 4. **kwargs: parent (Optional[str]): Parent admin code logname (str): Log using this identifying name. Defaults to not logging. @@ -771,7 +767,7 @@ def get_pcode( pcode = name_to_pcode.get(name.lower()) if pcode: return pcode, True - if not fuzzy_match or len(name) < fuzzy_length: + if not fuzzy_match: return None, True pcode = self.fuzzy_pcode(countryiso3, name, **kwargs) return pcode, False diff --git a/src/hdx/location/names.py b/src/hdx/location/names.py index 53a810e..40d1dcb 100644 --- a/src/hdx/location/names.py +++ b/src/hdx/location/names.py @@ -3,7 +3,7 @@ from unidecode import unidecode -non_ascii = r"([^\x00-\x7f])+" +non_ascii = "([^\x00-\x7f])+" def clean_name(name: str) -> str: @@ -25,7 +25,5 @@ def clean_name(name: str) -> str: ) # Remove all non-ASCII characters clean_name = re.sub(non_ascii, " ", clean_name) - clean_name = clean_name.replace("'", "") - clean_name = re.sub(r"[\W_]", " ", clean_name) clean_name = unidecode(clean_name) return clean_name.strip().lower() diff --git a/src/hdx/location/phonetics.py b/src/hdx/location/phonetics.py index 2b6fb22..a148295 100644 --- a/src/hdx/location/phonetics.py +++ b/src/hdx/location/phonetics.py @@ -1,18 +1,18 @@ from typing import Callable, Optional -from rapidfuzz import fuzz +import pyphonetics from hdx.utilities.typehint import ListTuple -class Phonetics: +class Phonetics(pyphonetics.RefinedSoundex): def match( self, possible_names: ListTuple, name: str, alternative_name: Optional[str] = None, transform_possible_names: ListTuple[Callable] = [], - threshold: float = 60, + threshold: int = 2, ) -> Optional[int]: """ Match name to one of the given possible names. Returns None if no match @@ -23,22 +23,22 @@ def match( name (str): Name to match alternative_name (str): Alternative name to match. Defaults to None. transform_possible_names (ListTuple[Callable]): Functions to transform possible names. - threshold (float): Match threshold. Value is 0-100. Defaults to 60. + threshold: Match threshold. Defaults to 2. Returns: Optional[int]: Index of matching name from possible names or None """ - max_similarity = 0 + mindistance = None matching_index = None transform_possible_names.insert(0, lambda x: x) def check_name(name, possible_name): - nonlocal max_similarity, matching_index # noqa: E999 + nonlocal mindistance, matching_index # noqa: E999 - similarity = fuzz.token_sort_ratio(name, possible_name) - if similarity > max_similarity: - max_similarity = similarity + distance = self.distance(name, possible_name) + if mindistance is None or distance < mindistance: + mindistance = distance matching_index = i for i, possible_name in enumerate(possible_names): @@ -51,6 +51,6 @@ def check_name(name, possible_name): check_name(name, transformed_possible_name) if alternative_name: check_name(alternative_name, transformed_possible_name) - if max_similarity < threshold: + if mindistance is None or mindistance > threshold: return None return matching_index diff --git a/tests/fixtures/adminlevel.yaml b/tests/fixtures/adminlevel.yaml index 8474afa..17fdc85 100755 --- a/tests/fixtures/adminlevel.yaml +++ b/tests/fixtures/adminlevel.yaml @@ -434,7 +434,6 @@ admin_info: - {pcode: XYZ123456, name: Random, iso3: XYZ} countries_fuzzy_try: - - AFG - NER - NGA - UKR diff --git a/tests/fixtures/adminlevelparent.yaml b/tests/fixtures/adminlevelparent.yaml index d7f2a9a..dec4134 100644 --- a/tests/fixtures/adminlevelparent.yaml +++ b/tests/fixtures/adminlevelparent.yaml @@ -16,11 +16,10 @@ admin_name_mappings: "AF05|MyMapping3": "AF0501" admin_name_replacements: - "wx": "an" + " city": "" alt1_admin_name_replacements: - "COD|wx": "an" + "COD| city": "" alt2_admin_name_replacements: - "CD20|wx": "an" - "CD31|wx": "en" + "CD20| city": "" diff --git a/tests/hdx/location/test_adminlevel.py b/tests/hdx/location/test_adminlevel.py index 9011a59..09443c2 100755 --- a/tests/hdx/location/test_adminlevel.py +++ b/tests/hdx/location/test_adminlevel.py @@ -182,10 +182,6 @@ def test_adminlevel(self, config): def test_adminlevel_fuzzy(self, config): adminone = AdminLevel(config) adminone.setup_from_admin_info(config["admin_info"]) - assert adminone.get_pcode("YEM", "Al_Dhale'a", logname="test") == ( - "YE30", - False, - ) assert adminone.get_pcode("YEM", "Al Dali", logname="test") == ( "YE30", False, @@ -196,24 +192,14 @@ def test_adminlevel_fuzzy(self, config): "YE30", False, ) - assert adminone.get_pcode("YEM", "Ad Jauf", logname="test") == ( - "YE16", - False, - ) - assert adminone.get_pcode("AFG", "Sar-E-Pul", logname="test") == ( - "AF22", - False, - ) assert adminone.get_pcode("SOM", "Bay", logname="test") == ( "SO24", True, ) output = adminone.output_matches() assert output == [ - "test - YEM: Matching (fuzzy) Ad Jauf to Al Jawf on map", "test - YEM: Matching (fuzzy) Al Dali to Ad Dali on map", "test - YEM: Matching (fuzzy) Al Dhale'e / الضالع to Ad Dali on map", - "test - YEM: Matching (fuzzy) Al_Dhale'a to Ad Dali on map", ] def test_adminlevel_parent(self, config_parent): @@ -295,19 +281,21 @@ def test_adminlevel_parent(self, config_parent): ) == (None, False) output = admintwo.output_admin_name_replacements() - assert output == ["wx: an"] + assert output == [" city: "] assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", logname="test" + "COD", "Mbanza-Ngungu city", logname="test" ) == ("CD2013", False) assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", parent="CD20", logname="test" + "COD", "Mbanza-Ngungu city", parent="CD20", logname="test" ) == ("CD2013", False) assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", parent="CD19", logname="test" + "COD", "Mbanza-Ngungu city", parent="CD19", logname="test" ) == (None, False) - assert admintwo.get_pcode( - "MWI", "City of Blwxtyre", logname="test" - ) == ( + assert admintwo.get_pcode("COD", "Kenge city", logname="test") == ( + "CD3102", + False, + ) + assert admintwo.get_pcode("MWI", "Blantyre city", logname="test") == ( "MW305", False, ) @@ -316,66 +304,58 @@ def test_adminlevel_parent(self, config_parent): "alt1_admin_name_replacements" ] output = admintwo.output_admin_name_replacements() - assert output == ["COD|wx: an"] + assert output == ["COD| city: "] assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", logname="test" + "COD", "Mbanza-Ngungu city", logname="test" ) == ("CD2013", False) assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", parent="CD20", logname="test" + "COD", "Mbanza-Ngungu city", parent="CD20", logname="test" ) == ("CD2013", False) assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", parent="CD19", logname="test" + "COD", "Mbanza-Ngungu city", parent="CD19", logname="test" ) == (None, False) - assert admintwo.get_pcode("COD", "Kenge City", logname="test") == ( + assert admintwo.get_pcode("COD", "Kenge city", logname="test") == ( "CD3102", False, ) - assert admintwo.get_pcode("COD", "Kwxge City", logname="test") == ( - None, - False, - ) - assert admintwo.get_pcode( - "COD", "Kwxge City", parent="CD31", logname="test" - ) == (None, False) assert admintwo.get_pcode( - "MWI", "City of Blwxtyre", logname="test" - ) == ( + "COD", "Kenge city", parent="CD31", logname="test" + ) == ("CD3102", False) + assert admintwo.get_pcode("MWI", "Blantyre city", logname="test") == ( None, False, ) assert admintwo.get_pcode( - "MWI", "City of Blwxtyre", parent="MW3", logname="test" + "MWI", "Blantyre city", parent="MW3", logname="test" ) == (None, False) admintwo.admin_name_replacements = config_parent[ "alt2_admin_name_replacements" ] output = admintwo.output_admin_name_replacements() - assert output == ["CD20|wx: an", "CD31|wx: en"] + assert output == ["CD20| city: "] assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", logname="test" + "COD", "Mbanza-Ngungu city", logname="test" ) == (None, False) assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", parent="CD20", logname="test" + "COD", "Mbanza-Ngungu city", parent="CD20", logname="test" ) == ("CD2013", False) assert admintwo.get_pcode( - "COD", "City of Mbwxza-Nanga", parent="CD19", logname="test" + "COD", "Mbanza-Ngungu city", parent="CD19", logname="test" ) == (None, False) - assert admintwo.get_pcode("COD", "Kwxge City", logname="test") == ( + assert admintwo.get_pcode("COD", "Kenge city", logname="test") == ( None, False, ) assert admintwo.get_pcode( - "COD", "Kwxge City", parent="CD31", logname="test" - ) == ("CD3102", False) - assert admintwo.get_pcode( - "MWI", "City of Blwxtyre", logname="test" - ) == ( + "COD", "Kenge city", parent="CD31", logname="test" + ) == (None, False) + assert admintwo.get_pcode("MWI", "Blantyre city", logname="test") == ( None, False, ) assert admintwo.get_pcode( - "MWI", "City of Blwxtyre", parent="MW3", logname="test" + "MWI", "Blantyre city", parent="MW3", logname="test" ) == (None, False) def test_adminlevel_with_url(self, config, url): @@ -477,7 +457,7 @@ def test_adminlevel_with_url(self, config, url): ) assert adminone.get_pcode("YEM", "Ad Dali", logname="test") == ( "YE30", - True, + False, ) assert adminone.get_pcode("YEM", "Ad Dal", logname="test") == ( "YE30", @@ -520,6 +500,7 @@ def test_adminlevel_with_url(self, config, url): "test - NGA: Matching (pcode length conversion) NG015 to Federal Capital Territory on map", "test - UKR: Matching (substring) Chernihiv Oblast to Chernihivska on map", "test - YEM: Matching (substring) Ad Dal to Ad Dali' on map", + "test - YEM: Matching (substring) Ad Dali to Ad Dali' on map", "test - YEM: Matching (pcode length conversion) YE30 to Ad Dali' on map", ] output = adminone.output_ignored()