Skip to content

Commit

Permalink
Enforcing that output_table does not contain "__" (#571)
Browse files Browse the repository at this point in the history
* Enforcing that output_table does not contain "__"

We use this special string to mark sharded output tables. For example, if the given output table is:
`project_id:dataset_id.table_id`

and in the sharding config file we have 3 shards with these suffixes:
* `chr1`
* `chr2`
* `residual`

then we will have these 4 outputs:

* `project_id:dataset_id.table_id__chr1`
* `project_id:dataset_id.table_id__chr2`
* `project_id:dataset_id.table_id__residual`
* `project_id:dataset_id.table_id__sample_info_table`

That's why we don't allow "__" to be used in the base table name.

* Review comments
  • Loading branch information
samanvp authored Mar 31, 2020
1 parent b569e63 commit cae7031
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 0 deletions.
2 changes: 2 additions & 0 deletions gcp_variant_transforms/libs/bigquery_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,8 @@ def compose_table_name(base_name, suffix, is_sample=False):
SAMPLE_TABLE_SUFFIX_SEPARATOR if is_sample else TABLE_SUFFIX_SEPARATOR)
return separator.join([base_name, suffix])

def get_table_base_name(table_name):
  """Returns the portion of `table_name` before the first suffix separator.

  Args:
    table_name: A table name that may have one or more suffixes appended to
      it, each introduced by `TABLE_SUFFIX_SEPARATOR`.

  Returns:
    Everything preceding the first occurrence of `TABLE_SUFFIX_SEPARATOR`, or
    `table_name` unchanged when the separator does not occur in it.
  """
  # Only the text ahead of the first separator is kept; the separator itself
  # and whatever follows it are discarded.
  base_name, _, _ = table_name.partition(TABLE_SUFFIX_SEPARATOR)
  return base_name

class LoadAvro(object):
def __init__(self,
Expand Down
22 changes: 22 additions & 0 deletions gcp_variant_transforms/libs/bigquery_util_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,28 @@ def test_raise_error_if_dataset_not_exists(self):
bigquery_util.raise_error_if_dataset_not_exists,
client, 'project', 'dataset')

def test_get_table_base_name(self):
  """Checks get_table_base_name with zero, one and two appended suffixes."""
  # Both the '.' and the ':' project delimiters are exercised.
  base_names = ['project_id.dataset_id.table_id',
                'project_id:dataset_id.table_id']
  # Suffix chains appended to each base name; the empty chain covers the
  # case where the name carries no suffix at all.
  suffix_chains = ['', '___chr1', '___chr1___extra_suffix']
  for suffix_chain in suffix_chains:
    for base_name in base_names:
      self.assertEqual(
          base_name,
          bigquery_util.get_table_base_name(base_name + suffix_chain))

def test_calculate_optimal_partition_size(self):
total_base_pairs_to_expected_partition_size = {
39980000: 10000,
Expand Down
9 changes: 9 additions & 0 deletions gcp_variant_transforms/options/variant_transform_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,11 @@ def validate(self, parsed_args, client=None):
'--sharding_config_path must point to a valid config file.')
# Ensuring (not) existence of output tables is aligned with --append value.
if parsed_args.output_table:
if (parsed_args.output_table !=
bigquery_util.get_table_base_name(parsed_args.output_table)):
raise ValueError(('Output table cannot contain "{}" we reserve this '
'string to mark sharded output tables.').format(
bigquery_util.TABLE_SUFFIX_SEPARATOR))
if not client:
credentials = GoogleCredentials.get_application_default().create_scoped(
['https://www.googleapis.com/auth/bigquery'])
Expand All @@ -210,6 +215,10 @@ def validate(self, parsed_args, client=None):
num_shards -= 1
for i in range(num_shards):
table_suffix = sharding.get_output_table_suffix(i)
if table_suffix != bigquery_util.get_table_base_name(table_suffix):
raise ValueError(('Table suffix cannot contain "{}" we reserve this '
'string to mark sharded output tables.').format(
bigquery_util.TABLE_SUFFIX_SEPARATOR))
all_output_tables.append(bigquery_util.compose_table_name(table_id,
table_suffix))

Expand Down

0 comments on commit cae7031

Please sign in to comment.