Merge pull request #26 from GreenBuildingRegistry/ft-updates

better message on parse city, state comparison, zipcode normalization
GreenBuildingRegistry · Apr 19, 2023 · a9837ce · a9837ce
2 parents d95b475 + 383505b
commit a9837ce
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 42 deletions.
diff --git a/README.rst b/README.rst
@@ -58,6 +58,9 @@ normalized_address_record() uses the included processing functions to remove una
 
 You may supply additional additional processing functions as a list of callable supplied to the addtl_funcs parameter. Any additional functions should take a string address and return a tuple of strings (line1, line2).
 
+Postal codes are normalized to US zip or zip+4 format and zero-padded as applicable, e.g. `2129 => 02129`, `02129-44 => 02129-0044`, `021290044 => 02129-0044`.
+However, postal codes that cannot be effectively normalized, such as those with an invalid length or invalid characters, will raise AddressValidationError, e.g. `12345678901`, `02129-`, or `02129-0044-123`.
+
 Alternately, you may extend the `NormalizeAddress` class to customize the normalization behavior by overriding any of the class' methods.
 
 If your address is in the form of a dict that does not use the keys address_line_1, address_line_2, city, state, and postal_code, you must supply a key map to the addr_map parameter in the format {standard_key: custom_key}

diff --git a/scourgify/normalize.py b/scourgify/normalize.py
@@ -182,7 +182,7 @@ def normalize_addr_str(addr_str,         # type: str
     :type zipcode: str
     :param addtl_funcs: optional sequence of funcs that take string for further
         processing and return line1 and line2 strings
-    :type addtl_funcs: Sequence[Callable[str, (str, str)]]
+    :type addtl_funcs: Sequence[Callable[str, (str)]]
     :return: address dict with uppercase parsed and normalized address values.
     :rtype: Mapping[str, str]
     """
@@ -241,7 +241,7 @@ def normalize_addr_str(addr_str,         # type: str
         # line1 is set to addr_str so complete dict can be passed to error.
         line1 = addr_str
 
-    addr_rec = dict(
+    addr_rec = OrderedDict(
         address_line_1=line1, address_line_2=line2, city=city,
         state=state, postal_code=zipcode
     )
@@ -417,7 +417,11 @@ def get_parsed_values(parsed_addr, orig_val, val_label, orig_addr_str):
     val_from_parse = post_clean_addr_str(val_from_parse)
     non_null_val_set = {orig_val, val_from_parse} - {None}
     if len(non_null_val_set) > 1:
-        raise AmbiguousAddressError(None, None, orig_addr_str)
+        msg = (
+            f'Parsed {val_label} does not align with submitted value: '
+            f'Parsed: {val_from_parse}. Original: {orig_val}'
+        )
+        raise AmbiguousAddressError(None, msg, orig_addr_str)
     else:
         return non_null_val_set.pop() if non_null_val_set else None
 
@@ -698,28 +702,32 @@ class NormalizeAddress(object):
 
     def __init__(self, address, addr_map=None, addtl_funcs=None, strict=None):
         self.address = address
-        self.addr_map = addr_map
         self.addtl_funcs = addtl_funcs
         self.strict = True if strict is None else strict
+        if addr_map and not isinstance(self.address, str):
+            self.address = {
+                key: self.address.get(val) for key, val in addr_map.items()
+            }
+
+    @staticmethod
+    def get_normalized_line_1(parsed_addr, line_labels=LINE1_USADDRESS_LABELS):
+        return get_normalized_line_segment(parsed_addr, line_labels)
+
+    @staticmethod
+    def get_normalized_line_2(parsed_addr, line_labels=LINE2_USADDRESS_LABELS):
+        return get_normalized_line_segment(parsed_addr, line_labels)
 
     def normalize(self):
         if isinstance(self.address, str):
-            return self.normalize_addr_str(
-                self.address, addtl_funcs=self.addtl_funcs
-            )
+            return self.normalize_addr_str(self.address)
         else:
-            return self.normalize_addr_dict(
-                self.address, addr_map=self.addr_map,
-                addtl_funcs=self.addtl_funcs, strict=self.strict
-            )
+            return self.normalize_addr_dict()
 
     def normalize_addr_str(self, addr_str,  # type: str
                            line2=None,  # type: Optional[str]
                            city=None,  # type: Optional[str]
                            state=None,  # type: Optional[str]
                            zipcode=None,  # type: Optional[str]
-                           addtl_funcs=None
-                           # type: Sequence[Callable[[str,str], str]]  # noqa
                            ):  # noqa
         # get address parsed into usaddress components.
         error = None
@@ -774,20 +782,16 @@ def normalize_addr_str(self, addr_str,  # type: str
             # assumes if line2 is passed in that it need not be parsed from
             # addr_str. Primarily used to allow advanced processing of
             # otherwise unparsable addresses.
-            line2 = line2 if line2 else get_normalized_line_segment(
-                parsed_addr, LINE2_USADDRESS_LABELS
-            )
+            line2 = line2 if line2 else self.get_normalized_line_2(parsed_addr)
             line2 = self.post_clean_addr_str(line2)
             # line 1 is fully post cleaned in get_normalized_line_segment.
-            line1 = get_normalized_line_segment(
-                parsed_addr, LINE1_USADDRESS_LABELS
-            )
+            line1 = self.get_normalized_line_1(parsed_addr)
             validate_parens_groups_parsed(line1)
         else:
             # line1 is set to addr_str so complete dict can be passed to error.
             line1 = addr_str
 
-        addr_rec = dict(
+        addr_rec = OrderedDict(
             address_line_1=line1, address_line_2=line2, city=city,
             state=state, postal_code=zipcode
         )
@@ -796,12 +800,10 @@ def normalize_addr_str(self, addr_str,  # type: str
         else:
             return addr_rec
 
-    def normalize_addr_dict(self, addr_dict, addr_map=None, addtl_funcs=None,
-                            strict=True):
-        if addr_map:
-            addr_dict = {key: addr_dict.get(val) for key, val in
-                         addr_map.items()}
-        addr_dict = validate_address_components(addr_dict, strict=strict)
+    def normalize_addr_dict(self):
+        addr_dict = validate_address_components(
+            self.address, strict=self.strict
+        )
 
         # line 1 and line 2 elements are combined to ensure consistent
         # processing whether the line 2 elements are pre-parsed or
@@ -816,14 +818,14 @@ def normalize_addr_dict(self, addr_dict, addr_map=None, addtl_funcs=None,
         try:
             address = self.normalize_addr_str(
                 addr_str, city=city, state=state,
-                zipcode=zipcode, addtl_funcs=addtl_funcs
+                zipcode=zipcode
             )
         except AddressNormalizationError:
             addr_str = get_addr_line_str(
                 addr_dict, comma_separate=True, addr_parts=ADDRESS_KEYS
             )
             address = self.normalize_addr_str(
                 addr_str, city=city, state=state,
-                zipcode=zipcode, addtl_funcs=addtl_funcs
+                zipcode=zipcode
             )
         return address
diff --git a/scourgify/tests/test_address_normalization.py b/scourgify/tests/test_address_normalization.py
@@ -579,24 +579,44 @@ def test_validate_postal_code(self):
         """Test validate_us_postal_code_format"""
 
         with self.assertRaises(AddressValidationError):
-            zip_plus = '97219-0001-00'
-            validate_us_postal_code_format(zip_plus, self.address_dict)
+            zip_five = 'AAAAA'
+            validate_us_postal_code_format(zip_five, self.address_dict)
+
+        with self.assertRaises(AddressValidationError):
+            zip_five = '97219-AAAA'
+            validate_us_postal_code_format(zip_five, self.address_dict)
 
         with self.assertRaises(AddressValidationError):
-            zip_plus = '97219-00'
+            zip_plus = '97219-000100'
             validate_us_postal_code_format(zip_plus, self.address_dict)
 
         with self.assertRaises(AddressValidationError):
-            zip_plus = '972-0001'
+            zip_plus = '97219-0001-00'
             validate_us_postal_code_format(zip_plus, self.address_dict)
 
         with self.assertRaises(AddressValidationError):
             zip_five = '9721900'
             validate_us_postal_code_format(zip_five, self.address_dict)
 
-        with self.assertRaises(AddressValidationError):
-            zip_five = '972'
-            validate_us_postal_code_format(zip_five, self.address_dict)
+        zip_five = '972'
+        expected = '00972'
+        result = validate_us_postal_code_format(zip_five, self.address_dict)
+        self.assertEqual(expected, result)
+
+        zip_plus = '97219-00'
+        expected = '97219-0000'
+        result = validate_us_postal_code_format(zip_plus, self.address_dict)
+        self.assertEqual(expected, result)
+
+        zip_plus = '972-0001'
+        expected = '00972-0001'
+        result = validate_us_postal_code_format(zip_plus, self.address_dict)
+        self.assertEqual(expected, result)
+
+        zip_plus = '972190001'
+        expected = '97219-0001'
+        result = validate_us_postal_code_format(zip_plus, self.address_dict)
+        self.assertEqual(expected, result)
 
         expected = '97219'
         result = validate_us_postal_code_format(expected, self.address_dict)

diff --git a/scourgify/validations.py b/scourgify/validations.py
@@ -97,14 +97,28 @@ def validate_us_postal_code_format(postal_code, address):
         'US Postal Codes must conform to five-digit Zip or Zip+4 standards.'
     )
     postal_code = post_clean_addr_str(postal_code)
-    if '-' in postal_code:
-        plus_four_code = postal_code.split('-')
-        if len(plus_four_code) != 2:
+    plus_four_code = postal_code.split('-')
+    for code in plus_four_code:
+        try:
+            int(code)
+        except ValueError:
             error = True
-        elif len(plus_four_code[0]) != 5 or len(plus_four_code[1]) != 4:
+    if not error:
+        if '-' in postal_code:
+            if len(postal_code.replace('-', '')) > 9:
+                error = True
+            elif len(plus_four_code) != 2:
+                error = True
+            else:
+                postal_code = '-'.join([
+                    plus_four_code[0].zfill(5), plus_four_code[1].zfill(4)
+                ])
+        elif len(postal_code) == 9:
+            postal_code = '-'.join([postal_code[:5], postal_code[5:]])
+        elif len(postal_code) > 5:
             error = True
-    elif len(postal_code) != 5:
-        error = True
+        else:
+            postal_code = postal_code.zfill(5)
 
     if error:
         raise AddressValidationError(msg, None, address)

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name=usaddress-scourgify
-version=0.3.0
+version=0.4.0
 description=Clean US addresses following USPS pub 28 and RESO guidelines
 author=Fable Turas
 author_email[email protected]