Skip to content

Commit

Permalink
Merge pull request #26 from GreenBuildingRegistry/ft-updates
Browse files Browse the repository at this point in the history
better message on parse city, state comparison, zipcode normalization
  • Loading branch information
fablet authored Apr 19, 2023
2 parents d95b475 + 383505b commit a9837ce
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 42 deletions.
3 changes: 3 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ normalized_address_record() uses the included processing functions to remove una

You may supply additional processing functions as a list of callables passed to the addtl_funcs parameter. Any additional functions should take a string address and return a tuple of strings (line1, line2).

Postal codes are normalized to US zip or zip+4 format and zero-padded as applicable, e.g. `2129 => 02129`, `02129-44 => 02129-0044`, `021290044 => 02129-0044`.
However, postal codes that cannot be effectively normalized, such as those with an invalid length or invalid characters, will raise AddressValidationError, e.g. `12345678901`, `02129-`, or `02129-0044-123`.

Alternately, you may extend the `NormalizeAddress` class to customize the normalization behavior by overriding any of the class' methods.

If your address is in the form of a dict that does not use the keys address_line_1, address_line_2, city, state, and postal_code, you must supply a key map to the addr_map parameter in the format {standard_key: custom_key}
Expand Down
58 changes: 30 additions & 28 deletions scourgify/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def normalize_addr_str(addr_str, # type: str
:type zipcode: str
:param addtl_funcs: optional sequence of funcs that take string for further
processing and return line1 and line2 strings
:type addtl_funcs: Sequence[Callable[str, (str, str)]]
:type addtl_funcs: Sequence[Callable[str, (str)]]
:return: address dict with uppercase parsed and normalized address values.
:rtype: Mapping[str, str]
"""
Expand Down Expand Up @@ -241,7 +241,7 @@ def normalize_addr_str(addr_str, # type: str
# line1 is set to addr_str so complete dict can be passed to error.
line1 = addr_str

addr_rec = dict(
addr_rec = OrderedDict(
address_line_1=line1, address_line_2=line2, city=city,
state=state, postal_code=zipcode
)
Expand Down Expand Up @@ -417,7 +417,11 @@ def get_parsed_values(parsed_addr, orig_val, val_label, orig_addr_str):
val_from_parse = post_clean_addr_str(val_from_parse)
non_null_val_set = {orig_val, val_from_parse} - {None}
if len(non_null_val_set) > 1:
raise AmbiguousAddressError(None, None, orig_addr_str)
msg = (
f'Parsed {val_label} does not align with submitted value: '
f'Parsed: {val_from_parse}. Original: {orig_val}'
)
raise AmbiguousAddressError(None, msg, orig_addr_str)
else:
return non_null_val_set.pop() if non_null_val_set else None

Expand Down Expand Up @@ -698,28 +702,32 @@ class NormalizeAddress(object):

def __init__(self, address, addr_map=None, addtl_funcs=None, strict=None):
self.address = address
self.addr_map = addr_map
self.addtl_funcs = addtl_funcs
self.strict = True if strict is None else strict
if addr_map and not isinstance(self.address, str):
self.address = {
key: self.address.get(val) for key, val in addr_map.items()
}

@staticmethod
def get_normalized_line_1(parsed_addr, line_labels=LINE1_USADDRESS_LABELS):
return get_normalized_line_segment(parsed_addr, line_labels)

@staticmethod
def get_normalized_line_2(parsed_addr, line_labels=LINE2_USADDRESS_LABELS):
return get_normalized_line_segment(parsed_addr, line_labels)

def normalize(self):
if isinstance(self.address, str):
return self.normalize_addr_str(
self.address, addtl_funcs=self.addtl_funcs
)
return self.normalize_addr_str(self.address)
else:
return self.normalize_addr_dict(
self.address, addr_map=self.addr_map,
addtl_funcs=self.addtl_funcs, strict=self.strict
)
return self.normalize_addr_dict()

def normalize_addr_str(self, addr_str, # type: str
line2=None, # type: Optional[str]
city=None, # type: Optional[str]
state=None, # type: Optional[str]
zipcode=None, # type: Optional[str]
addtl_funcs=None
# type: Sequence[Callable[[str,str], str]] # noqa
): # noqa
# get address parsed into usaddress components.
error = None
Expand Down Expand Up @@ -774,20 +782,16 @@ def normalize_addr_str(self, addr_str, # type: str
# assumes if line2 is passed in that it need not be parsed from
# addr_str. Primarily used to allow advanced processing of
# otherwise unparsable addresses.
line2 = line2 if line2 else get_normalized_line_segment(
parsed_addr, LINE2_USADDRESS_LABELS
)
line2 = line2 if line2 else self.get_normalized_line_2(parsed_addr)
line2 = self.post_clean_addr_str(line2)
# line 1 is fully post cleaned in get_normalized_line_segment.
line1 = get_normalized_line_segment(
parsed_addr, LINE1_USADDRESS_LABELS
)
line1 = self.get_normalized_line_1(parsed_addr)
validate_parens_groups_parsed(line1)
else:
# line1 is set to addr_str so complete dict can be passed to error.
line1 = addr_str

addr_rec = dict(
addr_rec = OrderedDict(
address_line_1=line1, address_line_2=line2, city=city,
state=state, postal_code=zipcode
)
Expand All @@ -796,12 +800,10 @@ def normalize_addr_str(self, addr_str, # type: str
else:
return addr_rec

def normalize_addr_dict(self, addr_dict, addr_map=None, addtl_funcs=None,
strict=True):
if addr_map:
addr_dict = {key: addr_dict.get(val) for key, val in
addr_map.items()}
addr_dict = validate_address_components(addr_dict, strict=strict)
def normalize_addr_dict(self):
addr_dict = validate_address_components(
self.address, strict=self.strict
)

# line 1 and line 2 elements are combined to ensure consistent
# processing whether the line 2 elements are pre-parsed or
Expand All @@ -816,14 +818,14 @@ def normalize_addr_dict(self, addr_dict, addr_map=None, addtl_funcs=None,
try:
address = self.normalize_addr_str(
addr_str, city=city, state=state,
zipcode=zipcode, addtl_funcs=addtl_funcs
zipcode=zipcode
)
except AddressNormalizationError:
addr_str = get_addr_line_str(
addr_dict, comma_separate=True, addr_parts=ADDRESS_KEYS
)
address = self.normalize_addr_str(
addr_str, city=city, state=state,
zipcode=zipcode, addtl_funcs=addtl_funcs
zipcode=zipcode
)
return address
34 changes: 27 additions & 7 deletions scourgify/tests/test_address_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,24 +579,44 @@ def test_validate_postal_code(self):
"""Test validate_us_postal_code_format"""

with self.assertRaises(AddressValidationError):
zip_plus = '97219-0001-00'
validate_us_postal_code_format(zip_plus, self.address_dict)
zip_five = 'AAAAA'
validate_us_postal_code_format(zip_five, self.address_dict)

with self.assertRaises(AddressValidationError):
zip_five = '97219-AAAA'
validate_us_postal_code_format(zip_five, self.address_dict)

with self.assertRaises(AddressValidationError):
zip_plus = '97219-00'
zip_plus = '97219-000100'
validate_us_postal_code_format(zip_plus, self.address_dict)

with self.assertRaises(AddressValidationError):
zip_plus = '972-0001'
zip_plus = '97219-0001-00'
validate_us_postal_code_format(zip_plus, self.address_dict)

with self.assertRaises(AddressValidationError):
zip_five = '9721900'
validate_us_postal_code_format(zip_five, self.address_dict)

with self.assertRaises(AddressValidationError):
zip_five = '972'
validate_us_postal_code_format(zip_five, self.address_dict)
zip_five = '972'
expected = '00972'
result = validate_us_postal_code_format(zip_five, self.address_dict)
self.assertEqual(expected, result)

zip_plus = '97219-00'
expected = '97219-0000'
result = validate_us_postal_code_format(zip_plus, self.address_dict)
self.assertEqual(expected, result)

zip_plus = '972-0001'
expected = '00972-0001'
result = validate_us_postal_code_format(zip_plus, self.address_dict)
self.assertEqual(expected, result)

zip_plus = '972190001'
expected = '97219-0001'
result = validate_us_postal_code_format(zip_plus, self.address_dict)
self.assertEqual(expected, result)

expected = '97219'
result = validate_us_postal_code_format(expected, self.address_dict)
Expand Down
26 changes: 20 additions & 6 deletions scourgify/validations.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,28 @@ def validate_us_postal_code_format(postal_code, address):
'US Postal Codes must conform to five-digit Zip or Zip+4 standards.'
)
postal_code = post_clean_addr_str(postal_code)
if '-' in postal_code:
plus_four_code = postal_code.split('-')
if len(plus_four_code) != 2:
plus_four_code = postal_code.split('-')
for code in plus_four_code:
try:
int(code)
except ValueError:
error = True
elif len(plus_four_code[0]) != 5 or len(plus_four_code[1]) != 4:
if not error:
if '-' in postal_code:
if len(postal_code.replace('-', '')) > 9:
error = True
elif len(plus_four_code) != 2:
error = True
else:
postal_code = '-'.join([
plus_four_code[0].zfill(5), plus_four_code[1].zfill(4)
])
elif len(postal_code) == 9:
postal_code = '-'.join([postal_code[:5], postal_code[5:]])
elif len(postal_code) > 5:
error = True
elif len(postal_code) != 5:
error = True
else:
postal_code = postal_code.zfill(5)

if error:
raise AddressValidationError(msg, None, address)
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name=usaddress-scourgify
version=0.3.0
version=0.4.0
description=Clean US addresses following USPS pub 28 and RESO guidelines
author=Fable Turas
author_email=[email protected]
Expand Down

0 comments on commit a9837ce

Please sign in to comment.