From 79c7ef2573927c98f293a0d172c1645170eff7c0 Mon Sep 17 00:00:00 2001 From: Brian Park Date: Thu, 2 Nov 2023 08:47:17 -0700 Subject: [PATCH] generate_schema.py: fix schema amnesia which prints multiple type mismatch warnings (see #98) --- CHANGELOG.md | 18 +++ bigquery_schema_generator/generate_schema.py | 49 ++++-- tests/testdata.txt | 20 +++ tests/testdata/issue98.fmt.json | 150 +++++++++++++++++++ tests/testdata/issue98.json | 6 + 5 files changed, 231 insertions(+), 12 deletions(-) create mode 100644 tests/testdata/issue98.fmt.json create mode 100644 tests/testdata/issue98.json diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a00241..f2087cb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,24 @@ # Changelog * Unreleased + * **Bug Fix**: Prevent amnesia that causes multiple type mismatches warnings + * If a data set contains multiple records with a column which do not + match each other, then the old code would *remove* the corresponding + internal `schema_entry` for that column, and print a warning message. + * This means that subsequent records would recreate the `schema_entry`, + and a subsequent mismatch would print another warning message. + * This also meant that if there was a second record after the most + recent mismatch, the script would output a schema entry for the + mismatching column, corresponding to the type of the last record which + was not marked as a mismatch. + * The fix is to use a tombstone entry for the offending column, instead + of deleting the `schema_entry` completely. Only a single warning + message is printed, and the column is ignored for all subsequent + records in the input data set. + * See + [Issue#98](https://github.com/bxparks/bigquery-schema-generator/issues/98] + which identified this problem which seems to have existed from the + very beginning. * 1.6.0 (2023-04-01) * Allow `null` fields to convert to `REPEATED` because `bq load` seems to interpret null fields to be equivalent to an empty array `[]`. diff --git a/bigquery_schema_generator/generate_schema.py b/bigquery_schema_generator/generate_schema.py index 4a3a816..ed13990 100755 --- a/bigquery_schema_generator/generate_schema.py +++ b/bigquery_schema_generator/generate_schema.py @@ -136,13 +136,15 @@ def deduce_schema(self, input_data, *, schema_map=None): key: schema_entry } - The 'key' is the name of the table column. + The 'key' is the canonical column name, which is set to be the + lower-cased version of the sanitized key because BigQuery is + case-insensitive to its column name. schema_entry := { - 'status': 'hard | soft', + 'status': 'hard | soft | ignore', 'filled': True | False, 'info': { - 'name': key, + 'name': column_name, 'type': 'STRING | TIMESTAMP | DATE | TIME | FLOAT | INTEGER | BOOLEAN | RECORD' 'mode': 'NULLABLE | REQUIRED | REPEATED', @@ -160,6 +162,13 @@ def deduce_schema(self, input_data, *, schema_map=None): 'hard'. The status can transition from 'soft' to 'hard' but not the reverse. + The status of 'ignore' identifies a column where the type of one record + conflicts with the type of another record. The column will be ignored + in the final JSON schema. (Early versions of this script *removed* the + offending column entry completely upon the first mismatch. But that + caused subsequent records to recreate the schema entry, which would be + incorrect.) + The 'filled' entry indicates whether all input data records contained the given field. If the --infer_mode flag is given, the 'filled' entry is used to convert a NULLABLE schema entry to a REQUIRED schema entry or @@ -277,8 +286,7 @@ def merge_schema_entry( Returns the merged schema_entry. This method assumes that both 'old_schema_entry' and 'new_schema_entry' can be modified in place and - returned as the new schema_entry. Returns None if the field should - be removed from the schema due to internal consistency errors. + returned as the new schema_entry. 'base_path' is the string representing the current path within the nested record that leads to this specific entry. This is used during @@ -302,13 +310,19 @@ def merge_schema_entry( old_status = old_schema_entry['status'] new_status = new_schema_entry['status'] - # new 'soft' does not clobber old 'hard' + # If the field was previously determined to be inconsistent, hence set + # to 'ignore', do nothing and return immediately. + if old_status == 'ignore': + return old_schema_entry + + # new 'soft' retains the old 'hard' if old_status == 'hard' and new_status == 'soft': mode = self.merge_mode(old_schema_entry, new_schema_entry, base_path) if mode is None: - return None + old_schema_entry['status'] = 'ignore' + return old_schema_entry old_schema_entry['info']['mode'] = mode return old_schema_entry @@ -318,7 +332,8 @@ def merge_schema_entry( new_schema_entry, base_path) if mode is None: - return None + old_schema_entry['status'] = 'ignore' + return old_schema_entry new_schema_entry['info']['mode'] = mode return new_schema_entry @@ -389,7 +404,8 @@ def merge_schema_entry( new_schema_entry, base_path) if new_mode is None: - return None + old_schema_entry['status'] = 'ignore' + return old_schema_entry new_schema_entry['info']['mode'] = new_mode # For all other types... @@ -402,7 +418,8 @@ def merge_schema_entry( f'old=({old_status},{full_old_name},{old_mode},{old_type});' f' new=({new_status},{full_new_name},{new_mode},{new_type})' ) - return None + old_schema_entry['status'] = 'ignore' + return old_schema_entry new_info['type'] = candidate_type return new_schema_entry @@ -414,6 +431,11 @@ def merge_mode(self, old_schema_entry, new_schema_entry, base_path): flag), because REQUIRED is created only in the flatten_schema() method. Therefore, a NULLABLE->REQUIRED transition cannot occur. + Returns the merged mode. + + Returning None means that the modes of the old_schema and new_schema are + not compatible. + We have the following sub cases for the REQUIRED -> NULLABLE transition: @@ -425,8 +447,6 @@ def merge_mode(self, old_schema_entry, new_schema_entry, base_path): REQUIRED -> NULLABLE transition. b) If --infer_mode is not given, then we log an error and ignore this field from the schema. - - Returning a 'None' causes the field to be dropped from the schema. """ old_info = old_schema_entry['info'] new_info = new_schema_entry['info'] @@ -884,6 +904,11 @@ def flatten_schema_map( filled = meta['filled'] info = meta['info'] + # An 'ignore' status means different records had different types for + # this field, so should be ignored. + if status == 'ignore': + continue + # Schema entries with a status of 'soft' are caused by 'null' or # empty fields. Don't print those out if the 'keep_nulls' flag is # False. diff --git a/tests/testdata.txt b/tests/testdata.txt index bd5c992..d8ce010 100644 --- a/tests/testdata.txt +++ b/tests/testdata.txt @@ -167,6 +167,26 @@ SCHEMA [] END +# If there are multiple records with a column with a mismatched type, only the +# first one mismatch should trigger a warning message. All subsequent records +# should have that column ignored, even if the mismatch occurs multiple times. +# See Issue 98. +# +# Before that fix, the following would generate a warning each time there is a +# transition between matching and mismatching type for the problematic column. +# So for the following 4 records, the previous version would print 2 warnings, +# one for line #2, and one for line #4. +DATA +{ "ts": "2017-05-22T17:10:00-07:00" } +{ "ts": 1.0 } +{ "ts": 2.0 } +{ "ts": "2017-05-22T17:10:00-07:00" } +ERRORS +2: Ignoring field with mismatched type: old=(hard,ts,NULLABLE,TIMESTAMP); new=(hard,ts,NULLABLE,FLOAT) +SCHEMA +[] +END + # DATE cannot change into a non-String type. DATA { "d": "2017-01-01" } diff --git a/tests/testdata/issue98.fmt.json b/tests/testdata/issue98.fmt.json new file mode 100644 index 0000000..e1f66ef --- /dev/null +++ b/tests/testdata/issue98.fmt.json @@ -0,0 +1,150 @@ +{ + "landingPageClicks": 2, + "costInLocalCurrency": 3.211797061516033, + "impressions": 331, + "clicks": 2, + "totalEngagements": 3, + "account_id": 0, + "account_name": "dummy", + "date": "2021-07-10", + "dimensionValue": [ + { + "servingHoldReasons": [ + "STOPPED", + "CAMPAIGN_STOPPED" + ], + "lastModifiedAt": 1656605739000, + "content": { + "reference": "urn:" + }, + "createdAt": 1622099537000, + "review": { + "status": "APPROVED" + }, + "id": "urn:", + "lastModifiedBy": "urn:", + "createdBy": "urn:", + "isTest": false, + "isServing": false, + "campaign": "urn:", + "intendedStatus": "PAUSED", + "account": "urn:" + } + ], + "dimension": "creative" +} +{ + "landingPageClicks": 17, + "pivotValues_": [ + { + "message": "", + "status": 400 + } + ], + "costInLocalCurrency": 25.41813751523781, + "impressions": 2300, + "clicks": 17, + "totalEngagements": 45, + "account_id": 0, + "account_name": "dummy", + "date": "2021-06-03", + "dimensionValue": [ + "urn:" + ], + "dimension": "creative" +} +{ + "landingPageClicks": 11, + "pivotValues_": [ + { + "message": "", + "status": 400 + } + ], + "costInLocalCurrency": 8.094764692519716, + "impressions": 602, + "clicks": 11, + "totalEngagements": 15, + "account_id": 0, + "account_name": "dummy", + "date": "2021-06-27", + "dimensionValue": [ + "urn:" + ], + "dimension": "creative" +} +{ + "landingPageClicks": 10, + "pivotValues_": [ + { + "message": "", + "status": 400 + } + ], + "costInLocalCurrency": 15.095423445027421, + "impressions": 999, + "clicks": 10, + "totalEngagements": 19, + "account_id": 0, + "account_name": "dummy", + "date": "2021-06-06", + "dimensionValue": [ + "urn:" + ], + "dimension": "creative" +} +{ + "landingPageClicks": 19, + "pivotValues_": [ + { + "message": "", + "status": 400 + } + ], + "costInLocalCurrency": 26.39521675040559, + "impressions": 2982, + "clicks": 19, + "totalEngagements": 45, + "account_id": 0, + "account_name": "dummy", + "date": "2021-07-19", + "dimensionValue": [ + "urn:" + ], + "dimension": "creative" +} +{ + "landingPageClicks": 5, + "costInLocalCurrency": 7.54, + "impressions": 430, + "clicks": 5, + "totalEngagements": 12, + "account_id": 0, + "account_name": "dummy", + "date": "2021-12-25", + "dimensionValue": [ + { + "servingHoldReasons": [ + "CAMPAIGN_STOPPED", + "CAMPAIGN_TOTAL_BUDGET_HOLD" + ], + "lastModifiedAt": 1656583173000, + "content": { + "reference": "urn:" + }, + "createdAt": 1640163564000, + "review": { + "status": "APPROVED" + }, + "id": "urn:", + "lastModifiedBy": "urn:", + "createdBy": "urn:", + "isTest": false, + "isServing": false, + "campaign": "urn:", + "intendedStatus": "ACTIVE", + "account": "urn:" + } + ], + "dimension": "creative" +} diff --git a/tests/testdata/issue98.json b/tests/testdata/issue98.json new file mode 100644 index 0000000..0e05921 --- /dev/null +++ b/tests/testdata/issue98.json @@ -0,0 +1,6 @@ +{"landingPageClicks":2.0,"costInLocalCurrency":3.211797061516033,"impressions":331.0,"clicks":2.0,"totalEngagements":3.0,"account_id":0,"account_name":"dummy","date":"2021-07-10","dimensionValue":[{"servingHoldReasons":["STOPPED","CAMPAIGN_STOPPED"],"lastModifiedAt":1656605739000.0,"content":{"reference":"urn:"},"createdAt":1622099537000.0,"review":{"status":"APPROVED"},"id":"urn:","lastModifiedBy":"urn:","createdBy":"urn:","isTest":false,"isServing":false,"campaign":"urn:","intendedStatus":"PAUSED","account":"urn:"}],"dimension":"creative"} +{"landingPageClicks":17.0,"pivotValues_":[{"message":"","status":400.0}],"costInLocalCurrency":25.41813751523781,"impressions":2300.0,"clicks":17.0,"totalEngagements":45.0,"account_id":0,"account_name":"dummy","date":"2021-06-03","dimensionValue":["urn:"],"dimension":"creative"} +{"landingPageClicks":11.0,"pivotValues_":[{"message":"","status":400.0}],"costInLocalCurrency":8.094764692519716,"impressions":602.0,"clicks":11.0,"totalEngagements":15.0,"account_id":0,"account_name":"dummy","date":"2021-06-27","dimensionValue":["urn:"],"dimension":"creative"} +{"landingPageClicks":10.0,"pivotValues_":[{"message":"","status":400.0}],"costInLocalCurrency":15.095423445027421,"impressions":999.0,"clicks":10.0,"totalEngagements":19.0,"account_id":0,"account_name":"dummy","date":"2021-06-06","dimensionValue":["urn:"],"dimension":"creative"} +{"landingPageClicks":19.0,"pivotValues_":[{"message":"","status":400.0}],"costInLocalCurrency":26.39521675040559,"impressions":2982.0,"clicks":19.0,"totalEngagements":45.0,"account_id":0,"account_name":"dummy","date":"2021-07-19","dimensionValue":["urn:"],"dimension":"creative"} +{"landingPageClicks":5.0,"costInLocalCurrency":7.54,"impressions":430.0,"clicks":5.0,"totalEngagements":12.0,"account_id":0,"account_name":"dummy","date":"2021-12-25","dimensionValue":[{"servingHoldReasons":["CAMPAIGN_STOPPED","CAMPAIGN_TOTAL_BUDGET_HOLD"],"lastModifiedAt":1656583173000.0,"content":{"reference":"urn:"},"createdAt":1640163564000.0,"review":{"status":"APPROVED"},"id":"urn:","lastModifiedBy":"urn:","createdBy":"urn:","isTest":false,"isServing":false,"campaign":"urn:","intendedStatus":"ACTIVE","account":"urn:"}],"dimension":"creative"}