From ca2aab98960cd827f965e286648e1d5abadb9b79 Mon Sep 17 00:00:00 2001 From: bill-warner <71764352+bill-warner@users.noreply.github.com> Date: Fri, 21 May 2021 12:09:00 +0300 Subject: [PATCH 1/2] Bigquery Web: Add integration tests --- .scripts/README.md | 36 ++++++++++ .scripts/e2e.sh | 1 + .scripts/integration_test.sh | 54 +++++++++++++++ .scripts/pr_check.sh | 1 + .scripts/run_config.sh | 15 +++- .scripts/run_test.sh | 1 + .../web/v1/integration_tests.json | 69 +++++++++++++++++++ .../events_staged_integration_test_1.json | 13 ++++ .../events_staged_integration_test_2.json | 13 ++++ .../events_staged_integration_test_3.json | 13 ++++ .../events_staged_integration_test_4.json | 13 ++++ .../events_staged_integration_test_5.json | 13 ++++ .../perm_integration_test_tables.json | 29 ++++++++ .../web/v1/bigquery_variables.yml.tmpl | 8 +++ 14 files changed, 276 insertions(+), 3 deletions(-) create mode 100755 .scripts/integration_test.sh create mode 100644 .test/great_expectations/expectations/web/v1/integration_tests.json create mode 100644 .test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_1.json create mode 100644 .test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_2.json create mode 100644 .test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_3.json create mode 100644 .test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_4.json create mode 100644 .test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_5.json create mode 100644 .test/great_expectations/validation_configs/web/v1/bigquery/perm_integration_test_tables.json create mode 100644 .test/integration_tests/web/v1/bigquery_variables.yml.tmpl diff --git a/.scripts/README.md b/.scripts/README.md index 27b39472..3639b88e 100644 --- a/.scripts/README.md +++ b/.scripts/README.md @@ -53,6 +53,7 @@ Note that this script does not 
enforce dependencies, rather runs the playbooks i -d (dryRun) use sql-runner dry run -o (output path) path to store output of sql-runner to sql file (to be used in conjunction with p) -t (target template) path to target template to use (minimizes risk of credential leak) +-v (variable template) path to variable template. Any variables in this template will override any corresponding variables within each playbook for the run. ``` **Examples:** @@ -178,6 +179,41 @@ bash .scripts/pr_check.sh -b ~/pathTo/sql-runner -d bigquery -m web; # Runs the pr check testing script against bigquery ``` +## integration_test.sh + +Runs 5 end-to-end runs of the standard model in 1 day increments, using the integration test dataset. The actual derived tables are then checked against the expected derived tables. The standard tests are also performed on the derived tables. + +We recommend using a virtual environment for python, e.g. `pyenv` or `virtualenv` - for example using the latter: + +```bash +virtualenv ~/myenv +source ~/myenv/bin/activate +``` + +Before running, make sure to install python requirements (python3 required): + +```bash +cd data-models/.test +pip3 install -r requirements.txt +``` + +**Arguments:** + +``` +-b (binary) path to sql-runner binary [required] +-d (database) target database for expectations [required] +-a (auth) optional credentials for database target +-m (model) target model to run i.e. web or mobile [required] +``` + +**Examples:** + +```bash +bash .scripts/integration_test.sh -b ~/pathTo/sql-runner -d bigquery -m web + +# Runs the integration testing script against bigquery +``` + ### `run_playbooks.sh` (deprecated) Deprecated - `run_config.sh` provides a simpler instrumentation for this functionality.
diff --git a/.scripts/e2e.sh b/.scripts/e2e.sh
index 6eb1ca56..abc564eb 100755
--- a/.scripts/e2e.sh
+++ b/.scripts/e2e.sh
@@ -4,6 +4,7 @@
 # -b (binary) path to sql-runner binary
 # -d (database) target database for expectations
 # -a (auth) optional credentials for database target
+# -m (model) target model to run i.e. web or mobile
 
 while getopts 'b:d:a:m:' v
 do
diff --git a/.scripts/integration_test.sh b/.scripts/integration_test.sh
new file mode 100755
index 00000000..cc2d6ce4
--- /dev/null
+++ b/.scripts/integration_test.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Runs 5 incremental runs of the model against the integration test dataset,
+# checking events_staged after every run and the derived tables at the end.
+# Expected input:
+# -b (binary) path to sql-runner binary (defaults to sql-runner on PATH)
+# -d (database) target database for expectations
+# -a (auth) optional credentials for database target
+# -m (model) target model to run i.e. web or mobile
+
+while getopts 'b:d:a:m:' opt
+do
+  case $opt in
+    b) SQL_RUNNER_PATH=$OPTARG ;;
+    d) DATABASE=$OPTARG ;;
+    a) CREDENTIALS=$OPTARG ;;
+    m) MODEL=$OPTARG ;;
+  esac
+done
+
+repo_root_path=$( cd "$(dirname "$(dirname "${BASH_SOURCE[0]}")")" && pwd -P )
+script_path="${repo_root_path}/.scripts"
+config_dir="${repo_root_path}/$MODEL/v1/$DATABASE/sql-runner/configs"
+# Use the binary given with -b; without -b fall back to sql-runner on PATH.
+sql_runner="${SQL_RUNNER_PATH:-sql-runner}"
+target_template="${script_path}/templates/${DATABASE}.yml.tmpl"
+# Kept relative: run_config.sh prefixes the -v path with its own root path.
+variables_template=".test/integration_tests/$MODEL/v1/${DATABASE}_variables.yml.tmpl"
+
+# Set credentials via env vars
+export BIGQUERY_CREDS=${BIGQUERY_CREDS:-$CREDENTIALS}
+export REDSHIFT_PASSWORD=${REDSHIFT_PASSWORD:-$CREDENTIALS}
+export SNOWFLAKE_PASSWORD=${SNOWFLAKE_PASSWORD:-$CREDENTIALS}
+
+echo "integration_check: Starting 5 runs"
+
+for i in {1..5}; do
+  echo "integration_check: Starting run $i";
+  bash "$script_path/run_config.sh" -b "$sql_runner" -c "$config_dir/pre_test.json" -t "$target_template" -v "$variables_template" || exit 1;
+
+  echo "integration_check: Checking actual vs. expected for the events_staged table";
+  bash "$script_path/run_test.sh" -m "$MODEL" -d "$DATABASE" -c "events_staged_integration_test_${i}" || exit 1;
+
+  bash "$script_path/run_config.sh" -b "$sql_runner" -c "$config_dir/post_test.json" -t "$target_template" -v "$variables_template" || exit 1;
+  echo "integration_check: run $i done";
+done || exit 1
+
+echo "integration_check: Checking actual vs. expected for derived tables";
+bash "$script_path/run_test.sh" -m "$MODEL" -d "$DATABASE" -c perm_integration_test_tables || exit 1;
+
+echo "integration_check: Checking standard tests against derived tables";
+bash "$script_path/run_test.sh" -m "$MODEL" -d "$DATABASE" -c perm_tables || exit 1;
+
+echo "integration_check: Done"
diff --git a/.scripts/pr_check.sh b/.scripts/pr_check.sh
index 675aa917..4104d71d 100755
--- a/.scripts/pr_check.sh
+++ b/.scripts/pr_check.sh
@@ -4,6 +4,7 @@
 # -b (binary) path to sql-runner binary
 # -d (database) target database for expectations
 # -a (auth) optional credentials for database target
+# -m (model) target model to run i.e.
web or mobile while getopts 'b:d:a:m:' v do diff --git a/.scripts/run_config.sh b/.scripts/run_config.sh index 1fb20f01..18705a1a 100755 --- a/.scripts/run_config.sh +++ b/.scripts/run_config.sh @@ -8,17 +8,19 @@ # -d (dryRun) use sql-runner dry run # -o (output path) path to store output of sql-runner to sql file (to be used in conjunction with p) # -t (target template) path to target template to use (minimizes risk of credential leak) +# -v (variables template) path to variables template to use -while getopts 'pdb:c:a:o:t:' v +while getopts 'pdb:c:a:o:t:v:' opt do - case $v in + case $opt in b) SQL_RUNNER_PATH=$OPTARG ;; c) CONFIG_PATH=$OPTARG ;; a) CREDENTIALS=$OPTARG ;; p) FILL_TEMPLATES='-fillTemplates' ;; d) DRY_RUN='-dryRun' ;; o) OUTPUT_PATH=$OPTARG ;; - t) TARGET_TEMPLATE=$OPTARG + t) TARGET_TEMPLATE=$OPTARG ;; + v) VARIABLES_TEMPLATE=$OPTARG esac done @@ -72,6 +74,13 @@ do fi + if [ ! -z "$VARIABLES_TEMPLATE" ]; then + + # Sub in any variables if specified + awk -F':' 'NR==FNR{a[$2]=$0;next} /:variables:/{flag=1} /:steps:/{flag=0} a[$2]&&flag{$0=a[$2]}1' $root_path/$VARIABLES_TEMPLATE $root_path/tmp/current_playbook.yml > $root_path/tmp/current_playbook.tmp && mv $root_path/tmp/current_playbook.tmp $root_path/tmp/current_playbook.yml + + fi + # If printing sql to file, mkdirs and set path vars if [ ! -z "$OUTPUT_PATH" ]; then mkdir -p $OUTPUT_PATH diff --git a/.scripts/run_test.sh b/.scripts/run_test.sh index ce882ffb..2b169379 100755 --- a/.scripts/run_test.sh +++ b/.scripts/run_test.sh @@ -4,6 +4,7 @@ # -d (database) target database for expectations # -c (config) expectation config name # -a (auth) optional credentials for database target +# -m (model) target model to run i.e.
web or mobile while getopts 'd:c:a:m:' v do diff --git a/.test/great_expectations/expectations/web/v1/integration_tests.json b/.test/great_expectations/expectations/web/v1/integration_tests.json new file mode 100644 index 00000000..61c74508 --- /dev/null +++ b/.test/great_expectations/expectations/web/v1/integration_tests.json @@ -0,0 +1,69 @@ +{ + "data_asset_type": "Dataset", + "expectation_suite_name": "integration_tests", + "expectations": [ + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "long_session" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "null_page_view_id" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "null_domain_userid" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "null_domain_sessionid" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "dupe_event_id_same_collector_tstamp" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "dupe_event_id_diff_collector_tstamp" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "dupe_page_view_id_diff_derived_tstamp" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "late_arriving_dvc_created_sent" + } + }, + { + "expectation_type": "expect_column_values_to_be_null", + "kwargs": { + "column": "clean_session" + } + } + ], + "meta": { + "versions": { + "test_suite_version": "1.1.0", + "bigquery_model_version": "1.0.3" + }, + "__comment__": "expect_column_values_to_be_null on column stray_page_ping has been removed as it is a known issue (https://github.com/snowplow/data-models/issues/92)", + "great_expectations.__version__": "0.12.0" + } +} + diff --git a/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_1.json 
b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_1.json new file mode 100644 index 00000000..a967ee5d --- /dev/null +++ b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_1.json @@ -0,0 +1,13 @@ +{ + "validation_operator_name": "action_list_operator", + "batches": [ + { + "batch_kwargs": { + "datasource": "bigquery", + "query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_1 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check", + "bigquery_temp_table": 
"ge_test_derived_events_staged_integration" + }, + "expectation_suite_names": ["web.v1.integration_tests"] + } + ] +} diff --git a/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_2.json b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_2.json new file mode 100644 index 00000000..90bd2188 --- /dev/null +++ b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_2.json @@ -0,0 +1,13 @@ +{ + "validation_operator_name": "action_list_operator", + "batches": [ + { + "batch_kwargs": { + "datasource": "bigquery", + "query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_2 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS 
late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check", + "bigquery_temp_table": "ge_test_derived_events_staged_integration" + }, + "expectation_suite_names": ["web.v1.integration_tests"] + } + ] +} diff --git a/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_3.json b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_3.json new file mode 100644 index 00000000..d055a24a --- /dev/null +++ b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_3.json @@ -0,0 +1,13 @@ +{ + "validation_operator_name": "action_list_operator", + "batches": [ + { + "batch_kwargs": { + "datasource": "bigquery", + "query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_3 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS 
dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check", + "bigquery_temp_table": "ge_test_derived_events_staged_integration" + }, + "expectation_suite_names": ["web.v1.integration_tests"] + } + ] +} diff --git a/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_4.json b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_4.json new file mode 100644 index 00000000..a6694e74 --- /dev/null +++ b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_4.json @@ -0,0 +1,13 @@ +{ + "validation_operator_name": "action_list_operator", + "batches": [ + { + "batch_kwargs": { + "datasource": "bigquery", + "query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_4 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS 
dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check", + "bigquery_temp_table": "ge_test_derived_events_staged_integration" + }, + "expectation_suite_names": ["web.v1.integration_tests"] + } + ] +} diff --git a/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_5.json b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_5.json new file mode 100644 index 00000000..12028b27 --- /dev/null +++ b/.test/great_expectations/validation_configs/web/v1/bigquery/events_staged_integration_test_5.json @@ -0,0 +1,13 @@ +{ + "validation_operator_name": "action_list_operator", + "batches": [ + { + "batch_kwargs": { + "datasource": "bigquery", + "query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.events_staged_run_5 AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM scratch_dev1.events_staged AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 
'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check", + "bigquery_temp_table": "ge_test_derived_events_staged_integration" + }, + "expectation_suite_names": ["web.v1.integration_tests"] + } + ] +} diff --git a/.test/great_expectations/validation_configs/web/v1/bigquery/perm_integration_test_tables.json b/.test/great_expectations/validation_configs/web/v1/bigquery/perm_integration_test_tables.json new file mode 100644 index 00000000..81190524 --- /dev/null +++ b/.test/great_expectations/validation_configs/web/v1/bigquery/perm_integration_test_tables.json @@ -0,0 +1,29 @@ +{ + "validation_operator_name": "action_list_operator", + "batches": [ + { + "batch_kwargs": { + "datasource": "bigquery", + "query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.page_views AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM derived_dev1.page_views AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL 
OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check", + "bigquery_temp_table": "ge_test_derived_pv_integration" + }, + "expectation_suite_names": ["web.v1.integration_tests"] + }, + { + "batch_kwargs": { + "datasource": "bigquery", + "query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.sessions AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM derived_dev1.sessions AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL 
domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check", + "bigquery_temp_table": "ge_test_derived_sessions_integration" + }, + "expectation_suite_names": ["web.v1.integration_tests"] + }, + { + "batch_kwargs": { + "datasource": "bigquery", + "query": "WITH expected_hashed AS ( SELECT a AS data, FARM_FINGERPRINT(FORMAT( '%%T', a)) AS h FROM dv_test_data.users AS a ), actual_hashed AS ( SELECT b AS data, FARM_FINGERPRINT(FORMAT( '%%T', b)) AS h FROM derived_dev1.users AS b ), equality_check AS ( SELECT IF(l.h IS NULL, 'New on right', 'New on left') AS Change, IF(l.h IS NULL,r.data,l.data).* FROM expected_hashed l FULL OUTER JOIN actual_hashed r ON l.h = r.h WHERE l.h IS NULL OR r.h IS NULL)SELECT SUM(CASE WHEN user_id = 'long session' THEN 1 END) AS long_session, SUM(CASE WHEN user_id = 'NULL page_view_id' THEN 1 END) AS null_page_view_id, SUM(CASE WHEN user_id = 'NULL domain_userid' THEN 1 END) AS null_domain_userid, SUM(CASE WHEN user_id = 'NULL domain_sessionid' THEN 1 END) AS null_domain_sessionid, SUM(CASE WHEN user_id = 'dupe: event_id same collector_tstamp' THEN 1 END) AS dupe_event_id_same_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: event_id different collector_tstamp' THEN 1 END) AS 
dupe_event_id_diff_collector_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id same derived_tstamp' THEN 1 END) AS dupe_page_view_id_same_derived_tstamp, SUM(CASE WHEN user_id = 'dupe: page_view_id different derived_tstamp' THEN 1 END) AS dupe_page_view_id_diff_derived_tstamp, SUM(CASE WHEN user_id = 'late arriving: device created/sent >3 days' THEN 1 END) AS late_arriving_dvc_created_sent, SUM(CASE WHEN user_id = 'stray page ping' THEN 1 END) AS stray_page_ping, SUM(CASE WHEN user_id = 'No edge cases' THEN 1 END) AS clean_session FROM equality_check", + "bigquery_temp_table": "ge_test_derived_users_integration" + }, + "expectation_suite_names": ["web.v1.integration_tests"] + } + ] +} diff --git a/.test/integration_tests/web/v1/bigquery_variables.yml.tmpl b/.test/integration_tests/web/v1/bigquery_variables.yml.tmpl new file mode 100644 index 00000000..1eab52a4 --- /dev/null +++ b/.test/integration_tests/web/v1/bigquery_variables.yml.tmpl @@ -0,0 +1,8 @@ +:variables: + :input_schema: dv_test_data + :scratch_schema: scratch_dev1 + :output_schema: derived_dev1 + :entropy: "" + :start_date: 2021-03-01 + :update_cadence_days: 1 +:step: From 0be98b51b0c8353b810fb94958f85ba498fb6343 Mon Sep 17 00:00:00 2001 From: bill-warner <71764352+bill-warner@users.noreply.github.com> Date: Thu, 27 May 2021 10:10:51 +0300 Subject: [PATCH 2/2] Prepare for release --- .test/great_expectations/expectations/web/v1/base.json | 2 +- .test/great_expectations/expectations/web/v1/base_redshift.json | 2 +- .../expectations/web/v1/integration_tests.json | 2 +- .test/great_expectations/expectations/web/v1/metadata.json | 2 +- .../expectations/web/v1/page_view_in_session_values.json | 2 +- .test/great_expectations/expectations/web/v1/page_views.json | 2 +- .test/great_expectations/expectations/web/v1/sessions.json | 2 +- .test/great_expectations/expectations/web/v1/users.json | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git 
a/.test/great_expectations/expectations/web/v1/base.json b/.test/great_expectations/expectations/web/v1/base.json index 79390b6a..89b68216 100644 --- a/.test/great_expectations/expectations/web/v1/base.json +++ b/.test/great_expectations/expectations/web/v1/base.json @@ -137,7 +137,7 @@ ], "meta": { "versions": { - "test_suite_version": "1.1.0", + "test_suite_version": "1.1.1", "bigquery_model_version": "1.0.3", "snowflake_model_version": "1.0.0" }, diff --git a/.test/great_expectations/expectations/web/v1/base_redshift.json b/.test/great_expectations/expectations/web/v1/base_redshift.json index c4ec971c..9e87a41d 100644 --- a/.test/great_expectations/expectations/web/v1/base_redshift.json +++ b/.test/great_expectations/expectations/web/v1/base_redshift.json @@ -285,7 +285,7 @@ ], "meta": { "versions": { - "test_suite_version": "1.1.0", + "test_suite_version": "1.1.1", "redshift_model_version": "1.2.0" }, "great_expectations.__version__": "0.12.0" diff --git a/.test/great_expectations/expectations/web/v1/integration_tests.json b/.test/great_expectations/expectations/web/v1/integration_tests.json index 61c74508..ca6d9bfc 100644 --- a/.test/great_expectations/expectations/web/v1/integration_tests.json +++ b/.test/great_expectations/expectations/web/v1/integration_tests.json @@ -59,7 +59,7 @@ ], "meta": { "versions": { - "test_suite_version": "1.1.0", + "test_suite_version": "1.1.1", "bigquery_model_version": "1.0.3" }, "__comment__": "expect_column_values_to_be_null on column stray_page_ping has been removed as it is a known issue (https://github.com/snowplow/data-models/issues/92)", diff --git a/.test/great_expectations/expectations/web/v1/metadata.json b/.test/great_expectations/expectations/web/v1/metadata.json index 3117826a..cdc76817 100644 --- a/.test/great_expectations/expectations/web/v1/metadata.json +++ b/.test/great_expectations/expectations/web/v1/metadata.json @@ -102,7 +102,7 @@ ], "meta": { "versions": { - "test_suite_version": "1.1.0", + 
"test_suite_version": "1.1.1", "redshift_model_version": "1.2.0", "bigquery_model_version": "1.0.3", "snowflake_model_version": "1.0.0" diff --git a/.test/great_expectations/expectations/web/v1/page_view_in_session_values.json b/.test/great_expectations/expectations/web/v1/page_view_in_session_values.json index 294bbfa5..996cd0d2 100644 --- a/.test/great_expectations/expectations/web/v1/page_view_in_session_values.json +++ b/.test/great_expectations/expectations/web/v1/page_view_in_session_values.json @@ -26,7 +26,7 @@ ], "meta": { "versions": { - "test_suite_version": "1.1.0", + "test_suite_version": "1.1.1", "redshift_model_version": "1.2.0", "bigquery_model_version": "1.0.3", "snowflake_model_version": "1.0.0" diff --git a/.test/great_expectations/expectations/web/v1/page_views.json b/.test/great_expectations/expectations/web/v1/page_views.json index 57ef3902..b5f194ed 100644 --- a/.test/great_expectations/expectations/web/v1/page_views.json +++ b/.test/great_expectations/expectations/web/v1/page_views.json @@ -224,7 +224,7 @@ ], "meta": { "versions": { - "test_suite_version": "1.1.0", + "test_suite_version": "1.1.1", "redshift_model_version": "1.2.0", "bigquery_model_version": "1.0.3", "snowflake_model_version": "1.0.0" diff --git a/.test/great_expectations/expectations/web/v1/sessions.json b/.test/great_expectations/expectations/web/v1/sessions.json index 908b78e0..b0234180 100644 --- a/.test/great_expectations/expectations/web/v1/sessions.json +++ b/.test/great_expectations/expectations/web/v1/sessions.json @@ -180,7 +180,7 @@ ], "meta": { "versions": { - "test_suite_version": "1.1.0", + "test_suite_version": "1.1.1", "redshift_model_version": "1.2.0", "bigquery_model_version": "1.0.3", "snowflake_model_version": "1.0.0" diff --git a/.test/great_expectations/expectations/web/v1/users.json b/.test/great_expectations/expectations/web/v1/users.json index f25db4bf..b4d2fe6e 100644 --- a/.test/great_expectations/expectations/web/v1/users.json +++ 
b/.test/great_expectations/expectations/web/v1/users.json @@ -116,7 +116,7 @@ ], "meta": { "versions": { - "test_suite_version": "1.1.0", + "test_suite_version": "1.1.1", "redshift_model_version": "1.2.0", "bigquery_model_version": "1.0.3", "snowflake_model_version": "1.0.0"