From 960c4b76193d96a447b7638725490b98993fd0b3 Mon Sep 17 00:00:00 2001 From: alexobaseki Date: Wed, 25 Dec 2024 11:59:22 -0500 Subject: [PATCH 1/3] Clean up phone --- openstates/cli/convert_us.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/openstates/cli/convert_us.py b/openstates/cli/convert_us.py index 5ab1208c1..449fefe69 100644 --- a/openstates/cli/convert_us.py +++ b/openstates/cli/convert_us.py @@ -1,3 +1,4 @@ +import re import typing import uuid from collections import defaultdict @@ -28,7 +29,22 @@ def make_org_id(id_: str) -> str: return "ocd-organization/" + str(uuid.uuid5(US_UUID_NAMESPACE, id_)) +def sanitize_phone(phone: str) -> str: + """Remove trail text, toll-free phone number or N/A""" + if phone.lower() in ["n/a", "same as above"]: + return "" + + pattern = r"\((\d{3})\)\s*(\d{3})-(\d{4})" + match = re.search(pattern, phone) + if match: + # Format the first matched number as XXX-XXX-XXXX + formatted_number = f"{match.group(1)}-{match.group(2)}-{match.group(3)}" + return formatted_number + return phone + + def _fix_bad_dashes(phone: str) -> str: + phone = sanitize_phone(phone) return phone.replace("–", "-") @@ -43,7 +59,6 @@ def get_district_offices() -> defaultdict[str, list[Office]]: if office.get("suite"): address += " " + office["suite"] address += f"; {office['city']}, {office['state']} {office['zip']}" - district_offices[entry["id"]["bioguide"]].append( Office( classification="district", From f80411f7729e34576e17b7f8e9bf3014c3c4ff5b Mon Sep 17 00:00:00 2001 From: alexobaseki Date: Wed, 25 Dec 2024 12:10:23 -0500 Subject: [PATCH 2/3] add some comments --- openstates/cli/convert_us.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openstates/cli/convert_us.py b/openstates/cli/convert_us.py index 449fefe69..c0fe433da 100644 --- a/openstates/cli/convert_us.py +++ b/openstates/cli/convert_us.py @@ -33,7 +33,7 @@ def sanitize_phone(phone: str) -> str: """Remove trail text, toll-free phone 
number or N/A""" if phone.lower() in ["n/a", "same as above"]: return "" - + # Some phone numbers might appear like (123) 456-7890 pattern = r"\((\d{3})\)\s*(\d{3})-(\d{4})" match = re.search(pattern, phone) if match: @@ -59,6 +59,7 @@ def get_district_offices() -> defaultdict[str, list[Office]]: if office.get("suite"): address += " " + office["suite"] address += f"; {office['city']}, {office['state']} {office['zip']}" + district_offices[entry["id"]["bioguide"]].append( Office( classification="district", From 9a67e1e6656c3fa9187a891064d1706a4ebf3119 Mon Sep 17 00:00:00 2001 From: alexobaseki Date: Fri, 27 Dec 2024 15:46:13 -0500 Subject: [PATCH 3/3] Bump version to 6.20.13 --- CHANGELOG.md | 4 ++++ pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d3688b64..1e8772fe6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## 6.20.13 - Dec 27, 2024 + +* Sanitize phone number for US people scrape. + ## 6.20.12 - Nov 22, 2024 * Use transformers to trim incoming strings at import that are too long for DB columns: diff --git a/pyproject.toml b/pyproject.toml index d843ac486..fcc15d276 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "openstates" -version = "6.20.12" +version = "6.20.13" description = "core infrastructure for the openstates project" authors = ["James Turk "] license = "MIT"