From 0b6f1ba2b372c108b6716a0f1e21d21b7ba2ff27 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Fri, 22 Nov 2024 14:57:56 -0600 Subject: [PATCH] Avoid DB import bugs by trimming strings at import transform --- openstates/importers/base.py | 24 ++++++++++++++---------- openstates/settings.py | 11 ++++++++++- openstates/utils/transformers.py | 4 ++++ 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/openstates/importers/base.py b/openstates/importers/base.py index 7ab80fb71..1198b0c5f 100644 --- a/openstates/importers/base.py +++ b/openstates/importers/base.py @@ -532,16 +532,20 @@ def apply_transformers( if transformers is None: transformers = self.cached_transformers - for key, key_transformers in transformers.items(): - if key not in data: - continue - if isinstance(key_transformers, list): - for transformer in key_transformers: - data[key] = transformer(data[key]) - elif isinstance(key_transformers, dict): - self.apply_transformers(data[key], key_transformers) - else: - data[key] = key_transformers(data[key]) + if isinstance(data, list): + for data_item in data: + self.apply_transformers(data_item, transformers) + else: + for key, key_transformers in transformers.items(): + if key not in data: + continue + if isinstance(key_transformers, list): + for transformer in key_transformers: + data[key] = transformer(data[key]) + elif isinstance(key_transformers, dict): + self.apply_transformers(data[key], key_transformers) + else: + data[key] = key_transformers(data[key]) return data diff --git a/openstates/settings.py b/openstates/settings.py index e1763ab02..b1551bbdd 100644 --- a/openstates/settings.py +++ b/openstates/settings.py @@ -24,7 +24,16 @@ CACHE_DIR = os.path.join(os.getcwd(), "_cache") SCRAPED_DATA_DIR = os.path.join(os.getcwd(), "_data") -IMPORT_TRANSFORMERS = {"bill": {"identifier": transformers.fix_bill_id}} +IMPORT_TRANSFORMERS = { + "bill": { + "identifier": transformers.fix_bill_id, + "documents": {"note": transformers.truncate_300}, # TODO remove when db migration done + "versions": {"note": transformers.truncate_300}, # TODO remove when db migration done + }, + "event": { + "media": {"note": transformers.truncate_300}, # TODO remove when db migration done + } +} # Django settings LOGGING = { diff --git a/openstates/utils/transformers.py b/openstates/utils/transformers.py index cab4ef1b8..d82cd3f89 100644 --- a/openstates/utils/transformers.py +++ b/openstates/utils/transformers.py @@ -19,3 +19,7 @@ def fix_bill_id(bill_id: str) -> str: def collapse_whitespace(value: str) -> str: return _whitespace_re.sub(" ", value) + + +def truncate_300(value: str) -> str: + return value[:300]