From cae70317f4a397ae2b245d780d0bca0cf062944a Mon Sep 17 00:00:00 2001 From: Saman Vaisipour Date: Tue, 31 Mar 2020 01:19:53 -0400 Subject: [PATCH] Enforcing output_table does not contain "__" (#571) * Enforcing output_table does not contain "__" We use this special string to mark sharded output tables; for example, if the given output table is: `project_id:dataset_id.table_id` and in the sharding config file we have 3 shards with these suffixes: * `chr1` * `chr2` * `residual` then we will have these 4 outputs: * `project_id:dataset_id.table_id__chr1` * `project_id:dataset_id.table_id__chr2` * `project_id:dataset_id.table_id__residual` * `project_id:dataset_id.table_id__sample_info_table` That's why we don't allow "__" to be used in the base table name. * Review comments --- gcp_variant_transforms/libs/bigquery_util.py | 2 ++ .../libs/bigquery_util_test.py | 22 +++++++++++++++++++ .../options/variant_transform_options.py | 9 ++++++++ 3 files changed, 33 insertions(+) diff --git a/gcp_variant_transforms/libs/bigquery_util.py b/gcp_variant_transforms/libs/bigquery_util.py index 9f138cd4c..c4a018baa 100644 --- a/gcp_variant_transforms/libs/bigquery_util.py +++ b/gcp_variant_transforms/libs/bigquery_util.py @@ -379,6 +379,8 @@ def compose_table_name(base_name, suffix, is_sample=False): SAMPLE_TABLE_SUFFIX_SEPARATOR if is_sample else TABLE_SUFFIX_SEPARATOR) return separator.join([base_name, suffix]) +def get_table_base_name(table_name): + return table_name.split(TABLE_SUFFIX_SEPARATOR)[0] class LoadAvro(object): def __init__(self, diff --git a/gcp_variant_transforms/libs/bigquery_util_test.py b/gcp_variant_transforms/libs/bigquery_util_test.py index bd523dd40..40b184129 100644 --- a/gcp_variant_transforms/libs/bigquery_util_test.py +++ b/gcp_variant_transforms/libs/bigquery_util_test.py @@ -460,6 +460,28 @@ def test_raise_error_if_dataset_not_exists(self): bigquery_util.raise_error_if_dataset_not_exists, client, 'project', 'dataset') + def test_get_table_base_name(self): 
+ without_suffix1 = 'project_id.dataset_id.table_id' + without_suffix2 = 'project_id:dataset_id.table_id' + self.assertEqual(without_suffix1, + bigquery_util.get_table_base_name(without_suffix1)) + self.assertEqual(without_suffix2, + bigquery_util.get_table_base_name(without_suffix2)) + + with_suffix1 = without_suffix1 + '___chr1' + with_suffix2 = without_suffix2 + '___chr1' + self.assertEqual(without_suffix1, + bigquery_util.get_table_base_name(with_suffix1)) + self.assertEqual(without_suffix2, + bigquery_util.get_table_base_name(with_suffix2)) + + with_two_suffixes1 = with_suffix1 + '___extra_suffix' + with_two_suffixes2 = with_suffix2 + '___extra_suffix' + self.assertEqual(without_suffix1, + bigquery_util.get_table_base_name(with_two_suffixes1)) + self.assertEqual(without_suffix2, + bigquery_util.get_table_base_name(with_two_suffixes2)) + def test_calculate_optimal_partition_size(self): total_base_pairs_to_expected_partition_size = { 39980000: 10000, diff --git a/gcp_variant_transforms/options/variant_transform_options.py b/gcp_variant_transforms/options/variant_transform_options.py index c861812cc..d3391ab26 100644 --- a/gcp_variant_transforms/options/variant_transform_options.py +++ b/gcp_variant_transforms/options/variant_transform_options.py @@ -189,6 +189,11 @@ def validate(self, parsed_args, client=None): '--sharding_config_path must point to a valid config file.') # Ensuring (not) existence of output tables is aligned with --append value. 
if parsed_args.output_table: + if (parsed_args.output_table != + bigquery_util.get_table_base_name(parsed_args.output_table)): + raise ValueError(('Output table cannot contain "{}" we reserve this ' + 'string to mark sharded output tables.').format( + bigquery_util.TABLE_SUFFIX_SEPARATOR)) if not client: credentials = GoogleCredentials.get_application_default().create_scoped( ['https://www.googleapis.com/auth/bigquery']) @@ -210,6 +215,10 @@ def validate(self, parsed_args, client=None): num_shards -= 1 for i in range(num_shards): table_suffix = sharding.get_output_table_suffix(i) + if table_suffix != bigquery_util.get_table_base_name(table_suffix): + raise ValueError(('Table suffix cannot contain "{}" we reserve this ' + 'string to mark sharded output tables.').format( + bigquery_util.TABLE_SUFFIX_SEPARATOR)) all_output_tables.append(bigquery_util.compose_table_name(table_id, table_suffix))