From 0e6dfe86419a6a4e5a6c306b65964714330ca34e Mon Sep 17 00:00:00 2001 From: Ilias Xenogiannis Date: Wed, 19 Jun 2024 13:40:05 +0300 Subject: [PATCH] Pe 6324 more robust init tests (#57) Add optional initial checks for missing fields based on context variables --- dbt_project.yml | 3 +- .../.scripts/integration_test.sh | 13 +- integration_tests/dbt_project.yml | 1 + .../snowplow_unified_events_stg.sql | 11 +- macros/context_existance_check.sql | 162 ++++++++++++++++++ macros/seed_existance_check.sql | 4 +- .../snowplow_unified_incremental_manifest.sql | 5 +- 7 files changed, 181 insertions(+), 18 deletions(-) create mode 100644 macros/context_existance_check.sql diff --git a/dbt_project.yml b/dbt_project.yml index f7b39416..c6283f90 100644 --- a/dbt_project.yml +++ b/dbt_project.yml @@ -66,7 +66,8 @@ vars: # snowplow__total_all_conversions: false snowplow__upsert_lookback_days: 30 snowplow__use_refr_if_mkt_null: false - + snowplow__enable_initial_checks: false + # please refer to the macros within identifiers.sql for default values snowplow__session_identifiers: [] snowplow__user_identifiers: [] diff --git a/integration_tests/.scripts/integration_test.sh b/integration_tests/.scripts/integration_test.sh index 48555b53..a7a7b605 100755 --- a/integration_tests/.scripts/integration_test.sh +++ b/integration_tests/.scripts/integration_test.sh @@ -23,17 +23,6 @@ fi for db in ${DATABASES[@]}; do - echo "Snowplow unified integration tests: Running without seeding data" - dbt run --full-refresh --target $db - - status=$? 
- - if [ $status -ne 0 ] && [ $status -ne 1 ]; then - echo "dbt command failed for target $db with exit status $status" - exit $status - - fi - echo "Snowplow unified integration tests: Seeding data" eval "dbt seed --full-refresh --target $db" || exit 1; @@ -88,4 +77,4 @@ for db in ${DATABASES[@]}; do echo "Snowplow unified integration tests: Mobile screen engagement tests passed" -done +done \ No newline at end of file diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml index ce70cdb3..2d8e6e09 100644 --- a/integration_tests/dbt_project.yml +++ b/integration_tests/dbt_project.yml @@ -90,6 +90,7 @@ vars: snowplow__enable_screen_context: true snowplow__enable_app_errors: true snowplow__enable_deep_link_context: true + snowplow__enable_initial_checks: false # These are at a full project level because they are using in both the unified and int test models to determine which columns are needed snowplow__enable_consent: true snowplow__enable_cwv: true diff --git a/integration_tests/models/source/databricks/snowplow_unified_events_stg.sql b/integration_tests/models/source/databricks/snowplow_unified_events_stg.sql index cf390b0d..412c6a2a 100644 --- a/integration_tests/models/source/databricks/snowplow_unified_events_stg.sql +++ b/integration_tests/models/source/databricks/snowplow_unified_events_stg.sql @@ -545,7 +545,16 @@ select unstruct_event_com_snowplowanalytics_snowplow_web_vitals_1[0].navigation_type::STRING as navigation_type) as unstruct_event_com_snowplowanalytics_snowplow_web_vitals_1, contexts_nl_basjes_yauaa_context_1, contexts_com_iab_snowplow_spiders_and_robots_1, - struct(''::STRING as basis_for_processing, ''::STRING as id, ''::STRING as name, ''::STRING as previous_id, ''::STRING as transition_type, '' as type) as unstruct_event_com_snowplowanalytics_mobile_screen_view_1 + struct(''::STRING as basis_for_processing, ''::STRING as id, ''::STRING as name, ''::STRING as previous_id, ''::STRING as transition_type, '' as type) 
as unstruct_event_com_snowplowanalytics_mobile_screen_view_1, + NULL as contexts_com_snowplowanalytics_snowplow_ua_parser_context_1, + NULL as contexts_com_snowplowanalytics_snowplow_client_session_1, + NULL as contexts_com_snowplowanalytics_snowplow_geolocation_context_1, + NULL as contexts_com_snowplowanalytics_mobile_application_1, + NULL as contexts_com_snowplowanalytics_mobile_deep_link_1, + NULL as contexts_com_snowplowanalytics_snowplow_browser_context_1, + NULL as contexts_com_snowplowanalytics_snowplow_mobile_context_1, + NULL as contexts_com_snowplowanalytics_mobile_screen_1, + NULL as unstruct_event_com_snowplowanalytics_snowplow_application_error_1 from prep {% endif %} diff --git a/macros/context_existance_check.sql b/macros/context_existance_check.sql new file mode 100644 index 00000000..e8bc9b96 --- /dev/null +++ b/macros/context_existance_check.sql @@ -0,0 +1,162 @@ +{# +Copyright (c) 2023-present Snowplow Analytics Ltd. All rights reserved. +This program is licensed to you under the Snowplow Personal and Academic License Version 1.0, +and you may not use this file except in compliance with the Snowplow Personal and Academic License Version 1.0. 
+You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 at https://docs.snowplow.io/personal-and-academic-license-1.0/ +#} + +{# Entry point: maps each snowplow__enable_* flag to the atomic column(s) it needs (on redshift/postgres, the table name held in the matching var) and dispatches to the adapter-specific check. #} +{% macro context_existance_check() %} + + {% set contexts = { + "snowplow__enable_mobile_context": [ + 'contexts_com_snowplowanalytics_snowplow_mobile_context_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__mobile_context') + ], + "snowplow__enable_iab": [ + 'contexts_com_iab_snowplow_spiders_and_robots_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__iab_context') + ], + "snowplow__enable_yauaa": [ + 'contexts_nl_basjes_yauaa_context_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__yauaa_context') + ], + "snowplow__enable_ua": [ + 'contexts_com_snowplowanalytics_snowplow_ua_parser_context_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__ua_parser_context') + ], + "snowplow__enable_application_context": [ + 'contexts_com_snowplowanalytics_mobile_application_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__application_context') + ], + "snowplow__enable_browser_context": [ + 'contexts_com_snowplowanalytics_snowplow_browser_context_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__browser_context') + ], + "snowplow__enable_geolocation_context": [ + 'contexts_com_snowplowanalytics_snowplow_geolocation_context_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__geolocation_context') + ], + "snowplow__enable_screen_context": [ + 'contexts_com_snowplowanalytics_mobile_screen_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__screen_context') + ], + "snowplow__enable_deep_link_context": [ + 'contexts_com_snowplowanalytics_mobile_deep_link_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__deep_link_context') + ], + "snowplow__enable_screen_summary_context": [ + 'contexts_com_snowplowanalytics_mobile_screen_summary_1' if target.type not in ['redshift',
'postgres'] else var('snowplow__screen_summary_context') + ], + "snowplow__enable_consent": [ + 'unstruct_event_com_snowplowanalytics_snowplow_cmp_visible_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__cmp_visible_events'), + 'unstruct_event_com_snowplowanalytics_snowplow_consent_preferences_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__consent_preferences_events') + ], + "snowplow__enable_cwv": [ + 'unstruct_event_com_snowplowanalytics_snowplow_web_vitals_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__cwv_events') + ], + "snowplow__enable_app_errors": [ + 'unstruct_event_com_snowplowanalytics_snowplow_application_error_1' if target.type not in ['redshift', 'postgres'] else var('snowplow__application_error_events') + ] + } + + %} + + {{ return(adapter.dispatch('context_existance_check', 'snowplow_unified')(contexts)) }} + +{% endmacro %} + +{% macro default__context_existance_check(contexts) %} + {# Column-based check: during run/run-operation with snowplow__enable_initial_checks set, raises a compiler error when an enabled context has no events-table column whose lower-cased name starts with the expected name. Does nothing if the events relation is not found. #} + {% if execute %} + {%- if flags.WHICH in ('run', 'run-operation') and var('snowplow__enable_initial_checks',false) -%} + + {% set relation = adapter.get_relation( + database= var('snowplow__database', target.database) if target.type not in ['databricks', 'spark'] else var('snowplow__databricks_catalog', 'hive_metastore') if target.type in ['databricks'] else var('snowplow__atomic_schema', 'atomic'), + schema=var('snowplow__atomic_schema', 'atomic'), + identifier=var('snowplow__events_table', 'events')) + %} + {% if relation %} + {% set available_contexts = dbt_utils.get_filtered_columns_in_relation(relation) %} + {% set available_contexts = available_contexts | map("lower") | list %} + + {# Loop through contexts dictionary keys #} + {% for context_key, context_value in contexts.items() %} + + {# Only check columns for contexts whose enable flag is set #} + {% if var(context_key, false) | as_bool() %} + + {# A flag may need several columns (e.g. consent), so check
each one #} + {% for context_value_i in context_value %} + + {% set found = [0] %} + + {# Scan the available columns; found is a one-element list so the inner loop can update it in place #} + {% for available_context in available_contexts %} + {# split() leaves an empty first element only when the column name starts with the expected name, so a column with an extra suffix still matches #} + {% if available_context.split(context_value_i)[0] | length == 0 %} + {% if found[0] == 0 %} + {% set _ = found.append(1) %} + {% set _ = found.pop(0) %} + {% endif %} + {% endif %} + {% endfor %} + + {% if found[0] == 0 %} + {{ log(relation, info = true)}} + {{ log(available_contexts, info=true) }} + {{ exceptions.raise_compiler_error( + "Snowplow Error: " ~ context_value_i ~ " column not found in " ~ relation ~". Please ensure the column is present when " ~ context_key ~ " is enabled." + )}} + {% endif %} + + {% endfor %} + {% endif %} + {% endfor %} + {% endif %} + {% endif %} + {% endif %} + +{% endmacro %} + +{% macro postgres__context_existance_check(contexts) %} + {# Table-based check: during run/run-operation with snowplow__enable_initial_checks set, raises a compiler error when an enabled context has no relation in the atomic schema matching its configured table name. Does nothing if the events relation is not found. #} + {% if execute %} + {%- if flags.WHICH in ('run', 'run-operation') and var('snowplow__enable_initial_checks',false) -%} + {% set relation = adapter.get_relation( + database= var('snowplow__database', target.database) if target.type not in ['databricks', 'spark'] else var('snowplow__databricks_catalog', 'hive_metastore') if target.type in ['databricks'] else var('snowplow__atomic_schema', 'atomic'), + schema=var('snowplow__atomic_schema', 'atomic'), + identifier=var('snowplow__events_table', 'events')) + %} + {% if relation %} + + {# Loop through contexts dictionary keys #} + {% for context_key, context_value in contexts.items() %} + + {# Only check tables for contexts whose enable flag is set #} + {% if var(context_key, false) | as_bool() %} + + {# A flag may need several tables (e.g. consent), so check each one #} + {% for context_value_i in context_value %} + + {% set found = [0] %} + + {# Look up the context table by
name #} + {# get_relations_by_pattern returns the relations in the atomic schema whose name matches table_pattern #} + + {% set relations = dbt_utils.get_relations_by_pattern(schema_pattern=var('snowplow__atomic_schema', 'atomic'), table_pattern= context_value_i) %} + + {# Check if the relation exists by assessing the length of the relations list #} + {% if relations | length > 0 %} + {% if found[0] == 0 %} + {% set _ = found.append(1) %} + {% set _ = found.pop(0) %} + {% endif %} + {% endif %} + + {% if found[0] == 0 %} + {{ log("Relations : " ~ relations, info = true) }} + {{ exceptions.raise_compiler_error( + "Snowplow Error: " ~ context_value_i ~ " table not found. Please ensure the table is present when " ~ context_key ~ " is enabled." + )}} + {% endif %} + + {% endfor %} + {% endif %} + {% endfor %} + {% endif %} + {% endif %} + {% endif %} + +{% endmacro %} \ No newline at end of file diff --git a/macros/seed_existance_check.sql b/macros/seed_existance_check.sql index 25bbd297..f0cc43e1 100644 --- a/macros/seed_existance_check.sql +++ b/macros/seed_existance_check.sql @@ -13,7 +13,7 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 {% if execute %} {# Ensure that this check is only performed during 'run' or 'build' commands #} {# Log the flags.WHICH#} - {%- if flags.WHICH in ('run', 'run-operation') -%} + {%- if flags.WHICH in ('run', 'run-operation') and var('snowplow__enable_initial_checks',false) -%} {% for node in graph.nodes.values() | selectattr("resource_type", "equalto", "seed") | selectattr("package_name", "equalto", "snowplow_unified") %} {% set schema = node.schema %} @@ -38,7 +38,7 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 {% macro spark__seed_existance_check() %} {% if execute %} {# Ensure that this check is only performed during 'run' or 'build' commands #} - {%- if flags.WHICH in ('run', 'run-operation') -%} + {%- if
flags.WHICH in ('run', 'run-operation') and var('snowplow__enable_initial_checks',false) -%} {% for node in graph.nodes.values() | selectattr("resource_type", "equalto", "seed") | selectattr("package_name", "equalto", "snowplow_unified") %} {% set schema = node.schema %} diff --git a/models/base/manifest/snowplow_unified_incremental_manifest.sql b/models/base/manifest/snowplow_unified_incremental_manifest.sql index ff02fd9f..3703ed0d 100644 --- a/models/base/manifest/snowplow_unified_incremental_manifest.sql +++ b/models/base/manifest/snowplow_unified_incremental_manifest.sql @@ -14,8 +14,9 @@ You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 'delta.autoOptimize.optimizeWrite' : 'true', 'delta.autoOptimize.autoCompact' : 'true' }, - pre_hook="{{ snowplow_unified.seed_existance_check() }}", - ) + pre_hook=["{{ snowplow_unified.seed_existance_check()}}", + "{{ snowplow_unified.context_existance_check()}}"], + ) }} {% set incremental_manifest_query = snowplow_utils.base_create_snowplow_incremental_manifest() %}