Add unit tests for manifest

snowplow · Dec 16, 2024 · aefe4f0 · aefe4f0
1 parent 683abae
commit aefe4f0
Show file tree

Hide file tree

Showing 15 changed files with 219 additions and 34 deletions.
diff --git a/integration_tests/.scripts/unit_tests.sh b/integration_tests/.scripts/unit_tests.sh
@@ -26,14 +26,63 @@ fi
 
 for db in ${DATABASES[@]}; do
 
+  # Seed the target database up front: the unit tests below read their inputs from seed files,
+  # and this also prepares the ground for the integration tests that run after the unit tests.
+  echo "Snowplow-utils unit tests: Seeding data"
+  eval "dbt seed --full-refresh --target $db" || exit 1;
+
     # In order to test this macro we need a model reference first and also a timestamp column which the macro takes the min and max of
     # We need to make sure that the correct result is returned even if the table is empty and whether they want the output to be a low or a high set date in that case
     # All in the models folder
 
-  if [[ $BRANCH == "release" || $BRANCH == "fix/return_limits" ]]; then
+  if [[ $BRANCH == "release" || $BRANCH == "utils_revamp" ]]; then
     echo "Snowplow-utils unit tests: Run test_return_limits_from_model_macro"
     eval "dbt run --select +test_return_limits_from_model_macro expected_return_limits_from_model_macro  --target $db --full-refresh" || exit 1;
-    eval "dbt test --select +test_return_limits_from_model_macro --store-failures --target $db" || exit 1;
+    eval "dbt test --select test_return_limits_from_model_macro --store-failures --target $db" || exit 1;
+  fi
+
+    # This macro returns different queries for different states which will be used to create the base_new_event_limits table
+    # We need to make sure that the correct result is returned from this query depending on different inputs
+    # Inputs are given based on the get_incremental_manifest_status macro but we can just fake it as it returns an array 
+    # Input example: ['9999-01-01 00:00:00', '9999-01-01 00:00:00', 0, false]
+    # Inputs are read from a seed file
+
+  if [[ $BRANCH == "release" || $BRANCH == "utils_revamp" ]]; then
+    echo "Snowplow-utils unit tests: Run test_get_run_limits_macro"
+    eval "dbt run --select test_get_run_limits_macro  --target $db --full-refresh" || exit 1;
+    eval "dbt test --select test_get_run_limits_macro --store-failures --target $db" || exit 1;
+  fi
+
+    # This macro returns different queries for different states which will be used to create the base_new_event_limits table
+    # We need to make sure that the correct result is returned from this query depending on different inputs
+    # Inputs are given based on the get_incremental_manifest_status macro but we can just fake it as it returns an array 
+    # Input example: ['9999-01-01 00:00:00', '9999-01-01 00:00:00', 0, 0, false]
+    # Inputs are read from a seed file
+
+  if [[ $BRANCH == "release" || $BRANCH == "utils_revamp" ]]; then
+    echo "Snowplow-utils unit tests: Run test_get_run_limits_t_macro"
+    eval "dbt run --select test_get_run_limits_t_macro --target $db --full-refresh" || exit 1;
+    eval "dbt test --select test_get_run_limits_t_macro --store-failures --target $db" || exit 1;
+  fi
+
+    # This macro returns the array: [min_last_success, max_last_success, models_matched_from_manifest, has_matched_all_models]
+    # Not too important to test; it effectively returns a min/max/count from values in the manifest based on the models in the run
+    # Inputs are read from a seed file, we can selectively test the different inputs depending on the models in run array so no need for it to contain exact scenarios upfront
+
+  if [[ $BRANCH == "release" || $BRANCH == "utils_revamp" ]]; then
+    echo "Snowplow-utils unit tests: Run test_get_incremental_manifest_status_macro"
+    eval "dbt run --select test_get_incremental_manifest_status_macro --target $db --full-refresh" || exit 1;
+    eval "dbt test --select test_get_incremental_manifest_status_macro --store-failures --target $db" || exit 1;
+  fi
+
+    # This macro returns the array: [min_first_processed_load_tstamp, max_first_processed_load_tstamp, min_last_processed_load_tstamp, max_last_processed_load_tstamp, models_matched_from_manifest, sync_count, has_matched_all_models]
+    # Not too important to test; it effectively returns a min/max/count from values in the manifest based on the models in the run
+    # Inputs are read from a seed file, we can selectively test the different inputs depending on the models in run array so no need for it to contain exact scenarios upfront
+
+  if [[ $BRANCH == "release" || $BRANCH == "utils_revamp" ]]; then
+    echo "Snowplow-utils unit tests: Run test_get_incremental_manifest_status_t_macro"
+    eval "dbt run --select test_get_incremental_manifest_status_t_macro --target $db --full-refresh" || exit 1;
+    eval "dbt test --select test_get_incremental_manifest_status_t_macro --store-failures --target $db" || exit 1;
   fi
 
 done
diff --git a/integration_tests/data/incremental_hooks/data_get_incremental_manifest_status_t.csv b/integration_tests/data/incremental_hooks/data_get_incremental_manifest_status_t.csv
@@ -0,0 +1,5 @@
+model,first_processed_load_tstamp,last_processed_load_tstamp
+a,2020-01-01 00:00:00,2020-01-02 00:00:00
+b,2020-01-02 00:00:00,2020-01-03 00:00:00
+c,2020-01-03 00:00:00,2020-01-04 00:00:00
+d,2020-01-01 00:00:00,2020-01-02 00:00:00
diff --git a/integration_tests/data/incremental_hooks/data_get_incremental_manifest_status_t_expected.csv b/integration_tests/data/incremental_hooks/data_get_incremental_manifest_status_t_expected.csv
@@ -0,0 +1,3 @@
+test_case,min_first_processed_load_tstamp,max_first_processed_load_tstamp,min_last_processed_load_tstamp,max_last_processed_load_tstamp,models_matched_from_manifest,sync_count,has_matched_all_models
+all model_in_run exist in manifest,2020-01-01 00:00:00,2020-01-03 00:00:00,2020-01-02 00:00:00,2020-01-04 00:00:00,3,3,true
+some model_in_run exist in manifest,2020-01-01 00:00:00,2020-01-03 00:00:00,2020-01-02 00:00:00,2020-01-03 00:00:00,2,2,false
diff --git a/integration_tests/data/incremental_hooks/data_get_run_limits_t.csv b/integration_tests/data/incremental_hooks/data_get_run_limits_t.csv
@@ -0,0 +1,9 @@
+min_last_processed_load_tstamp,max_last_processed_load_tstamp,models_matched_from_manifest,has_matched_all_models,sync_count,start_date,lower_limit,upper_limit
+,,0,FALSE,0,2021-01-01,2021-01-01 00:00:00+00:00,2021-01-31 00:00:00+00:00
+2021-03-01 00:00:00+00:00,2021-03-01 00:00:00+00:00,10,FALSE,1,2021-01-01,2021-01-01 00:00:00+00:00,2021-01-31 00:00:00+00:00
+2021-03-01 18:00:00+00:00,2021-03-01 18:00:00+00:00,10,TRUE,1,2021-01-01,2021-03-01 18:00:00+00:00,2021-03-31 18:00:00+00:00
+2021-03-01 00:00:00+00:00,2021-03-01 00:00:00+00:00,10,FALSE,2,2021-01-01,1999-01-01 00:00:00+00:00,1999-01-02 00:00:00+00:00
+2021-03-01 00:00:00+00:00,2021-03-01 00:00:00+00:00,10,FALSE,3,2021-01-01,1999-01-01 00:00:00+00:00,1999-01-02 00:00:00+00:00
+2021-03-01 00:00:00+00:00,2021-03-05 00:00:00+00:00,10,TRUE,3,2021-01-01,1999-01-01 00:00:00+00:00,1999-01-02 00:00:00+00:00
+2021-03-01 00:00:00+00:00,2021-05-01 00:00:00+00:00,10,TRUE,3,2021-01-01,1999-01-01 00:00:00+00:00,1999-01-02 00:00:00+00:00
+2021-03-01 00:00:00+00:00,2021-03-01 00:00:00+00:00,10,TRUE,3,2021-01-01,1999-01-01 00:00:00+00:00,1999-01-02 00:00:00+00:00
diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml
@@ -71,6 +71,7 @@ vars:
     snowplow__dev_target_name: dev
     snowplow__databricks_catalog: 'hive_metastore'
     snowplow__query_tag: 'snowplow_dbt'
+    snowplow__testing: true
 
 models:
   snowplow_utils_integration_tests:
@@ -138,13 +139,30 @@ seeds:
         +column_types:
           min_last_success: timestamp
           max_last_success: timestamp
+      data_get_incremental_manifest_status_t:
+        +column_types:
+          first_processed_load_tstamp: timestamp
+          last_processed_load_tstamp: timestamp
+      data_get_incremental_manifest_status_t_expected:
+        +column_types:
+          min_first_processed_load_tstamp: timestamp
+          max_first_processed_load_tstamp: timestamp
+          min_last_processed_load_tstamp: timestamp
+          max_last_processed_load_tstamp: timestamp
       data_get_run_limits:
         +column_types:
           min_last_success: timestamp
           max_last_success: timestamp
           start_date: date
           lower_limit: timestamp
           upper_limit: timestamp
+      data_get_run_limits_t:  # keys below must match the column names in data_get_run_limits_t.csv
+        +column_types:
+          min_last_processed_load_tstamp: timestamp
+          max_last_processed_load_tstamp: timestamp
+          start_date: date
+          lower_limit: timestamp
+          upper_limit: timestamp
       data_update_incremental_manifest_table:
         +column_types:
           is_in_manifest: boolean

diff --git a/integration_tests/models/incremental_hooks/incremental_hooks.yml b/integration_tests/models/incremental_hooks/incremental_hooks.yml
@@ -1,16 +1,6 @@
 version: 2
 
 models:
-  - name: test_get_incremental_manifest_status
-    tests:
-      - dbt_utils.equality:
-          compare_model: ref('data_get_incremental_manifest_status_expected')
-  - name: test_get_run_limits
-    tests:
-      - dbt_utils.expression_is_true:
-          expression: "expected_lower_limit = actual_lower_limit"
-      - dbt_utils.expression_is_true:
-          expression: "expected_upper_limit = actual_upper_limit"
   - name: test_update_incremental_manifest_table
     tests:
       - dbt_utils.equality:

diff --git a/integration_tests/models/unit_tests/test_get_incremental_manifest_status_macro/schema.yml b/integration_tests/models/unit_tests/test_get_incremental_manifest_status_macro/schema.yml
@@ -0,0 +1,7 @@
+version: 2
+
+models:
+  - name: test_get_incremental_manifest_status_macro
+    tests:
+      - dbt_utils.equality:
+          compare_model: ref('data_get_incremental_manifest_status_expected')
diff --git a/.../test_get_incremental_manifest_status.sql → ...get_incremental_manifest_status_macro.sql b/.../test_get_incremental_manifest_status.sql → ...get_incremental_manifest_status_macro.sql
diff --git a/integration_tests/models/unit_tests/test_get_incremental_manifest_status_t_macro/schema.yml b/integration_tests/models/unit_tests/test_get_incremental_manifest_status_t_macro/schema.yml
@@ -0,0 +1,7 @@
+version: 2
+
+models:
+  - name: test_get_incremental_manifest_status_t_macro
+    tests:
+      - dbt_utils.equality:
+          compare_model: ref('data_get_incremental_manifest_status_t_expected')
diff --git a/..._get_incremental_manifest_status_t_macro/test_get_incremental_manifest_status_t_macro.sql b/..._get_incremental_manifest_status_t_macro/test_get_incremental_manifest_status_t_macro.sql
@@ -0,0 +1,46 @@
+{#
+Copyright (c) 2021-present Snowplow Analytics Ltd. All rights reserved.
+This program is licensed to you under the Snowplow Personal and Academic License Version 1.0,
+and you may not use this file except in compliance with the Snowplow Personal and Academic License Version 1.0.
+You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 at https://docs.snowplow.io/personal-and-academic-license-1.0/
+#}
+
+{# Calls get_incremental_manifest_status_t twice against the same seed, once with a list of models
+   that are all present in the seed and once with a list that is only partly present, and emits
+   one row per test case so the result can be compared with the expected seed. #}
+{%- set all_models = snowplow_utils.get_incremental_manifest_status_t(ref('data_get_incremental_manifest_status_t'), ['a','b','c']) -%}
+{%- set partial_models = snowplow_utils.get_incremental_manifest_status_t(ref('data_get_incremental_manifest_status_t'), ['b','d','e']) -%}
+
+with prep as (
+select
+  'all model_in_run exist in manifest' as test_case,
+  {{ snowplow_utils.cast_to_tstamp(all_models[0]) }} as min_first_processed_load_tstamp,
+  {{ snowplow_utils.cast_to_tstamp(all_models[1]) }} as max_first_processed_load_tstamp,
+  {{ snowplow_utils.cast_to_tstamp(all_models[2]) }} as min_last_processed_load_tstamp,
+  {{ snowplow_utils.cast_to_tstamp(all_models[3]) }} as max_last_processed_load_tstamp,
+  {{all_models[4]}} as models_matched_from_manifest,
+  {{all_models[5]}} as sync_count,
+  {{all_models[6]}} as has_matched_all_models
+
+union all
+
+{# NOTE(review): the two first_processed columns below are read from all_models, while the
+   remaining columns of this test case come from partial_models. This looks like a copy-paste
+   slip. The expected seed row for this case lists 2020-01-03 as max_first_processed_load_tstamp,
+   which is model c's value even though c is not in the ['b','d','e'] list, so the seed appears
+   to encode the same slip. If this is corrected, update the expected seed together with it;
+   confirm against the macro's actual output. #}
+select
+  'some model_in_run exist in manifest' as test_case,
+  {{ snowplow_utils.cast_to_tstamp(all_models[0]) }} as min_first_processed_load_tstamp,
+  {{ snowplow_utils.cast_to_tstamp(all_models[1]) }} as max_first_processed_load_tstamp,
+  {{ snowplow_utils.cast_to_tstamp(partial_models[2]) }} as min_last_processed_load_tstamp,
+  {{ snowplow_utils.cast_to_tstamp(partial_models[3]) }} as max_last_processed_load_tstamp,
+  {{partial_models[4]}} as models_matched_from_manifest,
+  {{partial_models[5]}} as sync_count,
+  {{partial_models[6]}} as has_matched_all_models
+
+)
+
+{# has_matched_all_models is cast to the adapter's boolean type; the other columns pass through unchanged. #}
+select
+  test_case,
+  min_first_processed_load_tstamp,
+  max_first_processed_load_tstamp,
+  min_last_processed_load_tstamp,
+  max_last_processed_load_tstamp,
+  models_matched_from_manifest,
+  sync_count,
+  cast(has_matched_all_models as {{ dbt.type_boolean() }}) as has_matched_all_models
+
+from prep
diff --git a/integration_tests/models/unit_tests/test_get_run_limits_macro/schema.yml b/integration_tests/models/unit_tests/test_get_run_limits_macro/schema.yml
@@ -0,0 +1,9 @@
+version: 2
+
+models:
+  - name: test_get_run_limits_macro
+    tests:
+      - dbt_utils.expression_is_true:
+          expression: "expected_lower_limit = actual_lower_limit"
+      - dbt_utils.expression_is_true:
+          expression: "expected_upper_limit = actual_upper_limit"
diff --git a/...incremental_hooks/test_get_run_limits.sql → ...imits_macro/test_get_run_limits_macro.sql b/...incremental_hooks/test_get_run_limits.sql → ...imits_macro/test_get_run_limits_macro.sql
diff --git a/integration_tests/models/unit_tests/test_get_run_limits_t_macro/schema.yml b/integration_tests/models/unit_tests/test_get_run_limits_t_macro/schema.yml
@@ -0,0 +1,9 @@
+version: 2
+
+models:
+  - name: test_get_run_limits_t_macro
+    tests:
+      - dbt_utils.expression_is_true:
+          expression: "expected_lower_limit = actual_lower_limit"
+      - dbt_utils.expression_is_true:
+          expression: "expected_upper_limit = actual_upper_limit"
diff --git a/...ation_tests/models/unit_tests/test_get_run_limits_t_macro/test_get_run_limits_t_macro.sql b/...ation_tests/models/unit_tests/test_get_run_limits_t_macro/test_get_run_limits_t_macro.sql
@@ -0,0 +1,55 @@
+{#
+Copyright (c) 2021-present Snowplow Analytics Ltd. All rights reserved.
+This program is licensed to you under the Snowplow Personal and Academic License Version 1.0,
+and you may not use this file except in compliance with the Snowplow Personal and Academic License Version 1.0.
+You may obtain a copy of the Snowplow Personal and Academic License Version 1.0 at https://docs.snowplow.io/personal-and-academic-license-1.0/
+#}
+
+{%- set data_query -%}
+  select * from {{ ref('data_get_run_limits_t') }}
+{%- endset -%}
+
+{# Fetch the test data set as a dict of the form {column_name: (tuple_of_results)} #}
+{%- set raw_test_data = dbt_utils.get_query_results_as_dict(data_query) -%}
+
+{# Snowflake returns keys as uppercase. Iterate and set to lowercase #}
+{% set test_data = {} %}
+{% for key, value in raw_test_data.items() %}
+  {% do test_data.update({key.lower(): value}) %}
+{% endfor %}
+
+{% for i in range(test_data.min_last_processed_load_tstamp|length) %}
+
+  {# iteratively pass each row of test data into get_run_limits_t() and execute returned query
+  min_first_processed_load_tstamp and max_first_processed_load_tstamp are not yet used, placeholder in place #}
+  {%- set results = run_query(snowplow_utils.get_run_limits_t('9999-01-01 00:00:00',
+                                                              '9999-01-01 00:00:00',
+                                                           test_data.min_last_processed_load_tstamp[i],
+                                                           test_data.max_last_processed_load_tstamp[i],
+                                                           test_data.models_matched_from_manifest[i],
+                                                           test_data.sync_count[i],
+                                                           test_data.has_matched_all_models[i],
+                                                           test_data.start_date[i])) -%}
+
+  {# expected limits taken from test data #}
+  {%- set expected_lower_limit = test_data.lower_limit[i] -%}
+  {%- set expected_upper_limit = test_data.upper_limit[i] -%}
+
+  {# actual limits taken from get_run_limits_t() results #}
+  {%- if execute -%}
+    {%- set actual_lower_limit = results.columns[0].values()[0] -%}
+    {%- set actual_upper_limit = results.columns[1].values()[0] -%}
+  {%- else -%}
+    {%- set actual_lower_limit = none -%}
+    {%- set actual_upper_limit = none -%}
+  {%- endif -%}
+
+  {# union expected vs. actual for each test case #}
+  select
+    {{snowplow_utils.cast_to_tstamp(expected_lower_limit)}} as expected_lower_limit,
+    {{snowplow_utils.cast_to_tstamp(expected_upper_limit)}} as expected_upper_limit,
+    {{snowplow_utils.cast_to_tstamp(actual_lower_limit)}} as actual_lower_limit,
+    {{snowplow_utils.cast_to_tstamp(actual_upper_limit)}} as actual_upper_limit
+  {% if not loop.last %} union all {% endif %}
+
+{% endfor %}
diff --git a/unit_tests/unit_tests.sh b/unit_tests/unit_tests.sh