Skip to content

Commit

Permalink
Revert to pyphonetics (#46)
Browse files: browse the repository at this point in the history
  • Loading branch information
mcarans authored Jun 26, 2024
1 parent 993a5dd commit 962af9c
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 86 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ requires-python = ">=3.8"
dependencies = [
"hdx-python-utilities>=3.7.0",
"libhxl>=5.2.1",
"rapidfuzz",
"pyphonetics",
]
dynamic = ["version"]

Expand Down
16 changes: 9 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ charset-normalizer==3.3.2
# via requests
click==8.1.7
# via typer
coverage==7.5.3
coverage==7.5.4
# via pytest-cov
distlib==0.3.8
# via virtualenv
et-xmlfile==1.1.0
# via openpyxl
filelock==3.15.3
filelock==3.15.4
# via virtualenv
frictionless==5.17.0
# via hdx-python-utilities
Expand Down Expand Up @@ -60,7 +60,7 @@ loguru==0.7.2
# via hdx-python-utilities
markdown-it-py==3.0.0
# via rich
marko==2.1.1
marko==2.1.2
# via frictionless
markupsafe==2.1.5
# via jinja2
Expand Down Expand Up @@ -90,6 +90,8 @@ pydantic-core==2.18.4
# via pydantic
pygments==2.18.0
# via rich
pyphonetics==0.5.3
# via hdx-python-country (pyproject.toml)
pytest==8.2.2
# via
# hdx-python-country (pyproject.toml)
Expand All @@ -110,8 +112,6 @@ pyyaml==6.0.1
# frictionless
# pre-commit
# tableschema-to-template
rapidfuzz==3.9.3
# via hdx-python-country (pyproject.toml)
ratelimit==2.2.1
# via hdx-python-utilities
referencing==0.35.1
Expand Down Expand Up @@ -164,14 +164,16 @@ typing-extensions==4.12.2
# pydantic-core
# typer
unidecode==1.3.8
# via libhxl
# via
# libhxl
# pyphonetics
urllib3==2.2.2
# via
# libhxl
# requests
validators==0.28.3
# via frictionless
virtualenv==20.26.2
virtualenv==20.26.3
# via pre-commit
wheel==0.43.0
# via libhxl
Expand Down
20 changes: 8 additions & 12 deletions src/hdx/location/adminlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import hxl
from hxl import InputOptions
from hxl.input import HXLIOException
from unidecode import unidecode

from hdx.location.country import Country
from hdx.location.names import clean_name
Expand Down Expand Up @@ -146,7 +147,7 @@ def setup_row(
self.pcode_to_name[pcode] = adm_name

name_to_pcode = self.name_to_pcode.get(countryiso3, {})
name_to_pcode[clean_name(adm_name)] = pcode
name_to_pcode[unidecode(adm_name).lower()] = pcode
self.name_to_pcode[countryiso3] = name_to_pcode
self.pcode_to_iso3[pcode] = countryiso3
self.pcode_to_iso3[pcode] = countryiso3
Expand All @@ -156,7 +157,7 @@ def setup_row(
countryiso3, {}
)
name_to_pcode = name_parent_to_pcode.get(parent, {})
name_to_pcode[clean_name(adm_name)] = pcode
name_to_pcode[unidecode(adm_name).lower()] = pcode
name_parent_to_pcode[parent] = name_to_pcode
self.name_parent_to_pcode[countryiso3] = name_parent_to_pcode
self.pcode_to_parent[pcode] = parent
Expand Down Expand Up @@ -641,25 +642,22 @@ def fuzzy_pcode(
break
if not pcode:
map_names = list(name_to_pcode.keys())
lower_mapnames = [x.lower() for x in map_names]

def al_transform_1(name):
prefix = name[:3]
if prefix == "al ":
if name[:3] == "al ":
return f"ad {name[3:]}"
elif prefix == "ad ":
return f"al {name[3:]}"
else:
return None

def al_transform_2(name):
prefix = name[:3]
if prefix == "al " or prefix == "ad ":
if name[:3] == "al ":
return name[3:]
else:
return None

matching_index = self.phonetics.match(
map_names,
lower_mapnames,
adm_name_lookup,
alternative_name=adm_name_lookup2,
transform_possible_names=[al_transform_1, al_transform_2],
Expand Down Expand Up @@ -716,7 +714,6 @@ def get_pcode(
countryiso3: str,
name: str,
fuzzy_match: bool = True,
fuzzy_length: int = 4,
**kwargs: Any,
) -> Tuple[Optional[str], bool]:
"""Get pcode for a given name
Expand All @@ -725,7 +722,6 @@ def get_pcode(
countryiso3 (str): ISO3 country code
name (str): Name to match
fuzzy_match (bool): Whether to try fuzzy matching. Defaults to True.
fuzzy_length (int): Minimum length for fuzzy matching. Defaults to 4.
**kwargs:
parent (Optional[str]): Parent admin code
logname (str): Log using this identifying name. Defaults to not logging.
Expand Down Expand Up @@ -771,7 +767,7 @@ def get_pcode(
pcode = name_to_pcode.get(name.lower())
if pcode:
return pcode, True
if not fuzzy_match or len(name) < fuzzy_length:
if not fuzzy_match:
return None, True
pcode = self.fuzzy_pcode(countryiso3, name, **kwargs)
return pcode, False
Expand Down
4 changes: 1 addition & 3 deletions src/hdx/location/names.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from unidecode import unidecode

non_ascii = r"([^\x00-\x7f])+"
non_ascii = "([^\x00-\x7f])+"


def clean_name(name: str) -> str:
Expand All @@ -25,7 +25,5 @@ def clean_name(name: str) -> str:
)
# Remove all non-ASCII characters
clean_name = re.sub(non_ascii, " ", clean_name)
clean_name = clean_name.replace("'", "")
clean_name = re.sub(r"[\W_]", " ", clean_name)
clean_name = unidecode(clean_name)
return clean_name.strip().lower()
20 changes: 10 additions & 10 deletions src/hdx/location/phonetics.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
from typing import Callable, Optional

from rapidfuzz import fuzz
import pyphonetics

from hdx.utilities.typehint import ListTuple


class Phonetics:
class Phonetics(pyphonetics.RefinedSoundex):
def match(
self,
possible_names: ListTuple,
name: str,
alternative_name: Optional[str] = None,
transform_possible_names: ListTuple[Callable] = [],
threshold: float = 60,
threshold: int = 2,
) -> Optional[int]:
"""
Match name to one of the given possible names. Returns None if no match
Expand All @@ -23,22 +23,22 @@ def match(
name (str): Name to match
alternative_name (str): Alternative name to match. Defaults to None.
transform_possible_names (ListTuple[Callable]): Functions to transform possible names.
threshold (float): Match threshold. Value is 0-100. Defaults to 60.
threshold (int): Match threshold. Defaults to 2.
Returns:
Optional[int]: Index of matching name from possible names or None
"""
max_similarity = 0
mindistance = None
matching_index = None

transform_possible_names.insert(0, lambda x: x)

def check_name(name, possible_name):
nonlocal max_similarity, matching_index # noqa: E999
nonlocal mindistance, matching_index # noqa: E999

similarity = fuzz.token_sort_ratio(name, possible_name)
if similarity > max_similarity:
max_similarity = similarity
distance = self.distance(name, possible_name)
if mindistance is None or distance < mindistance:
mindistance = distance
matching_index = i

for i, possible_name in enumerate(possible_names):
Expand All @@ -51,6 +51,6 @@ def check_name(name, possible_name):
check_name(name, transformed_possible_name)
if alternative_name:
check_name(alternative_name, transformed_possible_name)
if max_similarity < threshold:
if mindistance is None or mindistance > threshold:
return None
return matching_index
1 change: 0 additions & 1 deletion tests/fixtures/adminlevel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,6 @@ admin_info:
- {pcode: XYZ123456, name: Random, iso3: XYZ}

countries_fuzzy_try:
- AFG
- NER
- NGA
- UKR
Expand Down
7 changes: 3 additions & 4 deletions tests/fixtures/adminlevelparent.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,10 @@ admin_name_mappings:
"AF05|MyMapping3": "AF0501"

admin_name_replacements:
"wx": "an"
" city": ""

alt1_admin_name_replacements:
"COD|wx": "an"
"COD| city": ""

alt2_admin_name_replacements:
"CD20|wx": "an"
"CD31|wx": "en"
"CD20| city": ""
Loading

0 comments on commit 962af9c

Please sign in to comment.