Skip to content

Commit

Permalink
Add support for reading parent pcode and using parent pcode in lookup
Browse files Browse the repository at this point in the history
  • Loading branch information
mcarans committed Mar 21, 2024
1 parent cbed6af commit a625397
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 37 deletions.
8 changes: 8 additions & 0 deletions documentation/main.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,13 @@ Method *setup_from_admin_info* takes key *admin_info* which is a list with
values of the form:

{"iso3": "AFG", "pcode": "AF01", "name": "Kabul"}
{"iso3": "AFG", "pcode": "AF0101", "name": "Kabul", "parent": "AF01"}

Dictionaries *pcode_to_name* and *pcode_to_iso3* are populated in the
AdminLevel object. *parent* is optional, but if provided, it enables lookup of
location names by both country and parent rather than by country alone, which
should help with any name clashes. It also populates the dictionary
*pcode_to_parent* in the AdminLevel object.

Method *setup_from_libhxl_dataset* takes a libhxl Dataset object, while
*setup_from_url* takes a URL which defaults to a resource in the global p-codes
Expand All @@ -144,6 +151,7 @@ Examples of usage:
adminlevel.get_pcode("YEM", "YEM030", logname="test") # returns ("YE30", True)
adminlevel.get_pcode("YEM", "Al Dhale'e / الضالع") # returns ("YE30", False)
adminlevel.get_pcode("YEM", "Al Dhale'e / الضالع", fuzzy_match=False) # returns (None, True)
admintwo.get_pcode("AFG", "Kabul", parent="AF01") # returns ("AF0101", True)

There is basic admin 1 p-code length conversion by default. A more advanced
p-code length conversion can be activated by calling *load_pcode_formats*
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ build-backend = "hatchling.build"
[project]
name = "hdx-python-country"
description = "HDX Python country code and exchange rate (fx) utilities"
authors = [{name = "Michael Rans", email = "[email protected]"}]
authors = [{name = "Michael Rans"}]
license = {text = "MIT"}
keywords = ["HDX", "location", "country", "country code", "iso 3166", "iso2", "iso3", "region", "fx", "currency", "currencies", "exchange rate", "foreign exchange"]
classifiers = [
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ urllib3==2.2.1
# via
# libhxl
# requests
validators==0.23.1
validators==0.23.2
# via frictionless
virtualenv==20.25.1
# via pre-commit
Expand Down
141 changes: 106 additions & 35 deletions src/hdx/location/adminlevel.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
import re
from typing import Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple

import hxl
from hxl import InputOptions
Expand Down Expand Up @@ -61,9 +61,12 @@ def __init__(
self.pcodes = []
self.pcode_lengths = {}
self.name_to_pcode = {}
self.name_parent_to_pcode = {}
self.pcode_to_name = {}
self.pcode_to_iso3 = {}
self.pcode_to_parent = {}
self.pcode_formats = {}
self.use_parent = False
self.zeroes = {}
self.parent_admins = []

Expand Down Expand Up @@ -120,15 +123,55 @@ def get_libhxl_dataset(cls, admin_url: str = _admin_url) -> hxl.Dataset:
)
raise

def setup_row(
    self,
    countryiso3: str,
    pcode: str,
    adm_name: str,
    parent: Optional[str],
) -> None:
    """
    Setup a single p-code by registering it in the lookup dictionaries.

    The p-code is appended to the list of p-codes, its length is recorded
    against the country (replacing any length recorded earlier for that
    country) and the p-code -> name, p-code -> country and
    country -> name -> p-code mappings are updated. If use_parent is set,
    the country -> parent -> name -> p-code and p-code -> parent mappings
    are updated as well.

    Args:
        countryiso3 (str): Country
        pcode (str): P-code
        adm_name (str): Administrative name
        parent (Optional[str]): Parent p-code

    Returns:
        None
    """
    self.pcode_lengths[countryiso3] = len(pcode)
    self.pcodes.append(pcode)
    self.pcode_to_name[pcode] = adm_name

    # Names are keyed in transliterated, lower case form. Compute the key
    # once as it is shared by the country and the country+parent lookups.
    lookup_name = unidecode(adm_name).lower()
    self.name_to_pcode.setdefault(countryiso3, {})[lookup_name] = pcode
    self.pcode_to_iso3[pcode] = countryiso3

    if self.use_parent:
        # Same name may occur under different parents within a country, so
        # keep a separate name -> p-code dictionary per parent. A row with
        # no parent is stored under the key None.
        names_by_parent = self.name_parent_to_pcode.setdefault(
            countryiso3, {}
        )
        names_by_parent.setdefault(parent, {})[lookup_name] = pcode
        self.pcode_to_parent[pcode] = parent

def setup_from_admin_info(
self,
admin_info: ListTuple[Dict],
countryiso3s: Optional[ListTuple[str]] = None,
) -> None:
"""
Setup p-codes from admin_info which is a list with values of the form:
Setup p-codes from admin_info which is a list with values of the form
below with parent optional:
::
{"iso3": "AFG", "pcode": "AF01", "name": "Kabul"}
{"iso3": "AFG", "pcode": "AF0101", "name": "Kabul", "parent": "AF01"}
Args:
admin_info (ListTuple[Dict]): p-code dictionary
countryiso3s (Optional[ListTuple[str]]): Countries to read. Defaults to None (all).
Expand All @@ -140,19 +183,15 @@ def setup_from_admin_info(
countryiso3s = [
countryiso3.upper() for countryiso3 in countryiso3s
]
self.use_parent = "parent" in admin_info[0]
for row in admin_info:
countryiso3 = row["iso3"].upper()
if countryiso3s and countryiso3 not in countryiso3s:
continue
pcode = row.get("pcode").upper()
self.pcodes.append(pcode)
self.pcode_lengths[countryiso3] = len(pcode)
adm_name = row["name"]
self.pcode_to_name[pcode] = adm_name
name_to_pcode = self.name_to_pcode.get(countryiso3, {})
name_to_pcode[unidecode(adm_name).lower()] = pcode
self.name_to_pcode[countryiso3] = name_to_pcode
self.pcode_to_iso3[pcode] = countryiso3
parent = row.get("parent")
self.setup_row(countryiso3, pcode, adm_name, parent)

def setup_from_libhxl_dataset(
self,
Expand All @@ -176,19 +215,15 @@ def setup_from_libhxl_dataset(
countryiso3s = [
countryiso3.upper() for countryiso3 in countryiso3s
]
self.use_parent = "#adm+code+parent" in admin_info.display_tags
for row in admin_info:
countryiso3 = row.get("#country+code").upper()
if countryiso3s and countryiso3 not in countryiso3s:
continue
pcode = row.get("#adm+code").upper()
self.pcodes.append(pcode)
self.pcode_lengths[countryiso3] = len(pcode)
adm_name = row.get("#adm+name")
self.pcode_to_name[pcode] = adm_name
name_to_pcode = self.name_to_pcode.get(countryiso3, {})
name_to_pcode[unidecode(adm_name).lower()] = pcode
self.name_to_pcode[countryiso3] = name_to_pcode
self.pcode_to_iso3[pcode] = countryiso3
parent = row.get("#adm+code+parent")
self.setup_row(countryiso3, pcode, adm_name, parent)

def setup_from_url(
self,
Expand Down Expand Up @@ -304,19 +339,22 @@ def init_matches_errors(self) -> None:
self.errors = set()

def convert_admin_pcode_length(
self, countryiso3: str, pcode: str, logname: Optional[str] = None
self, countryiso3: str, pcode: str, **kwargs: Any
) -> Optional[str]:
"""Standardise pcode length by country and match to an internal pcode.
Requires that p-code formats be loaded (eg. using load_pcode_formats)
Args:
countryiso3 (str): ISO3 country code
pcode (str): P code to match
logname (Optional[str]): Identifying name to use when logging. Defaults to None (don't log).
**kwargs:
parent (Optional[str]): Parent admin code
logname (str): Log using this identifying name. Defaults to not logging.
Returns:
Optional[str]: Matched P code or None if no match
"""
logname = kwargs.get("logname")
match = self.pcode_regex.match(pcode)
if not match:
return None
Expand Down Expand Up @@ -480,30 +518,49 @@ def convert_admin1_pcode_length(
return None

def fuzzy_pcode(
self, countryiso3: str, name: str, logname: Optional[str] = None
self,
countryiso3: str,
name: str,
**kwargs: Any,
) -> Optional[str]:
"""Fuzzy match name to pcode
Args:
countryiso3 (str): Iso3 country code
name (str): Name to match
logname (Optional[str]): Identifying name to use when logging. Defaults to None (don't log).
**kwargs:
parent (Optional[str]): Parent admin code
logname (str): Log using this identifying name. Defaults to not logging.
Returns:
Optional[str]: Matched P code or None if no match
"""
logname = kwargs.get("logname")
if (
self.countries_fuzzy_try is not None
and countryiso3 not in self.countries_fuzzy_try
):
if logname:
self.ignored.add((logname, countryiso3))
return None
name_to_pcode = self.name_to_pcode.get(countryiso3)
if not name_to_pcode:
if logname:
self.errors.add((logname, countryiso3))
return None
if self.use_parent and "parent" in kwargs:
parent = kwargs["parent"]
name_parent_to_pcode = self.name_parent_to_pcode.get(countryiso3)
if not name_parent_to_pcode:
if logname:
self.errors.add((logname, countryiso3))
return None
name_to_pcode = name_parent_to_pcode.get(parent)
if not name_to_pcode:
if logname:
self.errors.add((logname, countryiso3, parent))
return None
else:
name_to_pcode = self.name_to_pcode.get(countryiso3)
if not name_to_pcode:
if logname:
self.errors.add((logname, countryiso3))
return None
adm_name_lookup = clean_name(name)
adm_name_lookup2 = multiple_replace(
adm_name_lookup, self.admin_name_replacements
Expand Down Expand Up @@ -591,15 +648,17 @@ def get_pcode(
countryiso3: str,
name: str,
fuzzy_match: bool = True,
logname: Optional[str] = None,
**kwargs: Any,
) -> Tuple[Optional[str], bool]:
"""Get pcode for a given name
Args:
countryiso3 (str): Iso3 country code
name (str): Name to match
fuzzy_match (bool): Whether to try fuzzy matching. Defaults to True.
logname (Optional[str]): Identifying name to use when logging. Defaults to None (don't log).
**kwargs:
parent (Optional[str]): Parent admin code
logname (str): Log using this identifying name. Defaults to not logging.
Returns:
Tuple[Optional[str], bool]: (Matched P code or None if no match, True if exact match or False if not)
Expand All @@ -614,18 +673,30 @@ def get_pcode(
# name looks like a p-code, but doesn't match p-codes
# so try adjusting p-code length
pcode = self.convert_admin_pcode_length(
countryiso3, pcode, logname
countryiso3, pcode, **kwargs
)
return pcode, True
else:
name_to_pcode = self.name_to_pcode.get(countryiso3)
if name_to_pcode is not None:
pcode = name_to_pcode.get(name.lower())
if pcode:
return pcode, True
if self.use_parent and "parent" in kwargs:
parent = kwargs["parent"]
name_parent_to_pcode = self.name_parent_to_pcode.get(
countryiso3
)
if name_parent_to_pcode:
name_to_pcode = name_parent_to_pcode.get(parent)
if name_to_pcode is not None:
pcode = name_to_pcode.get(name.lower())
if pcode:
return pcode, True
else:
name_to_pcode = self.name_to_pcode.get(countryiso3)
if name_to_pcode is not None:
pcode = name_to_pcode.get(name.lower())
if pcode:
return pcode, True
if not fuzzy_match:
return None, True
pcode = self.fuzzy_pcode(countryiso3, name, logname)
pcode = self.fuzzy_pcode(countryiso3, name, **kwargs)
return pcode, False

def output_matches(self) -> List[str]:
Expand Down
5 changes: 5 additions & 0 deletions tests/fixtures/adminlevel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -556,3 +556,8 @@ admin_fuzzy_dont:
- "sahil"
- "say'on"
- "syria"

admin_info_with_parent:
- {pcode: AF0101, name: Kabul, iso3: AFG, parent: AF01}
- {pcode: AF0102, name: Paghman, iso3: AFG, parent: AF01}
- {pcode: AF0201, name: Kabul, iso3: AFG, parent: AF02} # testing purposes
Loading

0 comments on commit a625397

Please sign in to comment.