From 344c0913e3d3d020d2b664f85bfed06a6c5fb751 Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Wed, 15 Nov 2023 17:25:30 -0800 Subject: [PATCH 1/9] Define `event_monitoring_live_v1` views in `view.sql` files. So they get automatically deployed by the `bqetl_artifact_deployment.publish_views` Airflow task. --- bqetl_project.yaml | 2 +- sql_generators/glean_usage/event_monitoring_live.py | 8 ++++---- ...live_v1.init.sql => event_monitoring_live_v1.view.sql} | 0 3 files changed, 5 insertions(+), 5 deletions(-) rename sql_generators/glean_usage/templates/{event_monitoring_live_v1.init.sql => event_monitoring_live_v1.view.sql} (100%) diff --git a/bqetl_project.yaml b/bqetl_project.yaml index 8e43dd257d8..44dc3419c72 100644 --- a/bqetl_project.yaml +++ b/bqetl_project.yaml @@ -188,7 +188,7 @@ dry_run: - sql/moz-fx-data-shared-prod/org_mozilla_firefox_beta_derived/experiment_events_live_v1/init.sql - sql/moz-fx-data-shared-prod/telemetry_derived/experiment_enrollment_cumulative_population_estimate_v1/view.sql - sql/moz-fx-data-shared-prod/telemetry/experiment_enrollment_cumulative_population_estimate/view.sql - - sql/moz-fx-data-shared-prod/**/event_monitoring_live_v1/init.sql + - sql/moz-fx-data-shared-prod/**/event_monitoring_live_v1/view.sql - sql/moz-fx-data-shared-prod/monitoring/event_monitoring_live/view.sql # Already exists (and lacks an "OR REPLACE" clause) - sql/moz-fx-data-shared-prod/org_mozilla_firefox_derived/clients_first_seen_v1/init.sql diff --git a/sql_generators/glean_usage/event_monitoring_live.py b/sql_generators/glean_usage/event_monitoring_live.py index 227fb359369..37f943601a0 100644 --- a/sql_generators/glean_usage/event_monitoring_live.py +++ b/sql_generators/glean_usage/event_monitoring_live.py @@ -40,7 +40,7 @@ def generate_per_app_id( ): tables = table_names_from_baseline(baseline_table, include_project_id=False) - init_filename = f"{self.target_table_id}.init.sql" + view_filename = f"{self.target_table_id}.view.sql" metadata_filename = f"{self.target_table_id}.metadata.yaml" table = tables[f"{self.prefix}"] @@ -69,8 +69,8 @@ def generate_per_app_id( artifacts = [] if not self.no_init: - init_sql = render( - init_filename, template_folder=PATH / "templates", **render_kwargs + view_sql = render( + view_filename, template_folder=PATH / "templates", **render_kwargs ) metadata = render( metadata_filename, @@ -84,7 +84,7 @@ def generate_per_app_id( if output_dir: if not self.no_init: - artifacts.append(Artifact(table, "init.sql", init_sql)) + artifacts.append(Artifact(table, "view.sql", view_sql)) for artifact in artifacts: destination = ( diff --git a/sql_generators/glean_usage/templates/event_monitoring_live_v1.init.sql b/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql similarity index 100% rename from sql_generators/glean_usage/templates/event_monitoring_live_v1.init.sql rename to sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql From ffb4abe0fc5e66638570323c1e8ab38c46623808 Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Thu, 16 Nov 2023 15:31:02 -0800 Subject: [PATCH 2/9] Overwrite existing `event_monitoring_live_v1` views on deployment. --- .../event_monitoring_live_v1.view.sql | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql b/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql index 141c3fa893d..1396e61148f 100644 --- a/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql +++ b/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql @@ -1,9 +1,11 @@ -CREATE MATERIALIZED VIEW -IF - NOT EXISTS `{{ project_id }}.{{ derived_dataset }}.event_monitoring_live_v1` - OPTIONS - (enable_refresh = TRUE, refresh_interval_minutes = 60) AS - {% if dataset_id not in ["telemetry", "accounts_frontend", "accounts_backend"] %} +CREATE OR REPLACE MATERIALIZED VIEW + `{{ project_id }}.{{ derived_dataset }}.event_monitoring_live_v1` +OPTIONS( + enable_refresh = TRUE, + refresh_interval_minutes = 60 +) +AS + {% if dataset_id not in ["telemetry", "accounts_frontend", "accounts_backend"] %} SELECT DATE(submission_timestamp) AS submission_date, TIMESTAMP_ADD( @@ -93,7 +95,7 @@ IF UNNEST(GENERATE_ARRAY(0, ARRAY_LENGTH(ping_info.experiments))) AS experiment_index LEFT JOIN UNNEST(event.extra) AS event_extra - {% elif dataset_id in ["accounts_frontend", "accounts_backend"] %} + {% elif dataset_id in ["accounts_frontend", "accounts_backend"] %} -- FxA uses custom pings to send events without a category and extras. SELECT DATE(submission_timestamp) AS submission_date, @@ -155,7 +157,7 @@ IF -- Iterator for accessing experiments. -- Add one more for aggregating events across all experiments UNNEST(GENERATE_ARRAY(0, ARRAY_LENGTH(ping_info.experiments))) AS experiment_index - {% endif %} + {% endif %} WHERE DATE(submission_timestamp) >= "{{ current_date }}" GROUP BY From a472eb37b797b86cddf4aedd6f7d5aecb06b2b5a Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Thu, 16 Nov 2023 15:33:00 -0800 Subject: [PATCH 3/9] Support materialized views in view naming validation. --- bigquery_etl/view/__init__.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bigquery_etl/view/__init__.py b/bigquery_etl/view/__init__.py index afb1e5ede56..b0248103169 100644 --- a/bigquery_etl/view/__init__.py +++ b/bigquery_etl/view/__init__.py @@ -192,10 +192,17 @@ def _valid_view_naming(self): ] is_view_statement = ( " ".join(tokens[0].normalized.split()) == "CREATE OR REPLACE" - and tokens[1].normalized == "VIEW" + and ( + tokens[1].normalized == "VIEW" + or ( + tokens[1].normalized.upper() == "MATERIALIZED" + and tokens[2].normalized == "VIEW" + ) + ) ) if is_view_statement: - target_view = str(tokens[2]).strip().split()[0] + view_id_token = tokens[2] if tokens[1].normalized == "VIEW" else tokens[3] + target_view = str(view_id_token).strip().split()[0] try: [project_id, dataset_id, view_id] = target_view.replace("`", "").split( "." From eb7e0c3f77ff6e193c6cd9d1e82478a41092222c Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Thu, 16 Nov 2023 15:44:27 -0800 Subject: [PATCH 4/9] Update `EventMonitoringLive.no_init` logic. --- .../glean_usage/event_monitoring_live.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/sql_generators/glean_usage/event_monitoring_live.py b/sql_generators/glean_usage/event_monitoring_live.py index 37f943601a0..a1612435e4c 100644 --- a/sql_generators/glean_usage/event_monitoring_live.py +++ b/sql_generators/glean_usage/event_monitoring_live.py @@ -26,7 +26,7 @@ class EventMonitoringLive(GleanTable): def __init__(self) -> None: """Initialize materialized view generation.""" - self.no_init = False + self.no_init = True self.per_app_id_enabled = True self.per_app_enabled = False self.across_apps_enabled = True @@ -38,6 +38,7 @@ def __init__(self) -> None: def generate_per_app_id( self, project_id, baseline_table, output_dir=None, use_cloud_function=True ): + """Generate per-app_id views.""" tables = table_names_from_baseline(baseline_table, include_project_id=False) view_filename = f"{self.target_table_id}.view.sql" @@ -68,23 +69,21 @@ def generate_per_app_id( Artifact = namedtuple("Artifact", "table_id basename sql") artifacts = [] - if not self.no_init: - view_sql = render( - view_filename, template_folder=PATH / "templates", **render_kwargs - ) - metadata = render( - metadata_filename, - template_folder=PATH / "templates", - format=False, - **render_kwargs, - ) - artifacts.append(Artifact(table, "metadata.yaml", metadata)) + view_sql = render( + view_filename, template_folder=PATH / "templates", **render_kwargs + ) + metadata = render( + metadata_filename, + template_folder=PATH / "templates", + format=False, + **render_kwargs, + ) + artifacts.append(Artifact(table, "metadata.yaml", metadata)) skip_existing_artifact = self.skip_existing(output_dir, project_id) if output_dir: - if not self.no_init: - artifacts.append(Artifact(table, "view.sql", view_sql)) + artifacts.append(Artifact(table, "view.sql", view_sql)) for artifact in artifacts: destination = ( From de673f320cbefd11837da88d82d16d37c27b29eb Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Thu, 16 Nov 2023 15:54:17 -0800 Subject: [PATCH 5/9] Apply formatting required by Black. --- bigquery_etl/view/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/bigquery_etl/view/__init__.py b/bigquery_etl/view/__init__.py index b0248103169..850ca0bf13c 100644 --- a/bigquery_etl/view/__init__.py +++ b/bigquery_etl/view/__init__.py @@ -190,14 +190,13 @@ def _valid_view_naming(self): for t in parsed.tokens if not (t.is_whitespace or isinstance(t, sqlparse.sql.Comment)) ] - is_view_statement = ( - " ".join(tokens[0].normalized.split()) == "CREATE OR REPLACE" - and ( - tokens[1].normalized == "VIEW" - or ( - tokens[1].normalized.upper() == "MATERIALIZED" - and tokens[2].normalized == "VIEW" - ) + is_view_statement = " ".join( + tokens[0].normalized.split() + ) == "CREATE OR REPLACE" and ( + tokens[1].normalized == "VIEW" + or ( + tokens[1].normalized.upper() == "MATERIALIZED" + and tokens[2].normalized == "VIEW" ) ) if is_view_statement: From 4372bd921bac3cfed122cc5b9e616c8bde7cccb0 Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Thu, 16 Nov 2023 16:02:15 -0800 Subject: [PATCH 6/9] Don't try to overwrite existing `event_monitoring_live_v1` views on deployment. BigQuery doesn't currently allow us to replace existing materialized views. --- bigquery_etl/view/__init__.py | 7 ++++--- .../event_monitoring_live_v1.view.sql | 18 ++++++++---------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/bigquery_etl/view/__init__.py b/bigquery_etl/view/__init__.py index 850ca0bf13c..55c27bb223b 100644 --- a/bigquery_etl/view/__init__.py +++ b/bigquery_etl/view/__init__.py @@ -190,9 +190,10 @@ def _valid_view_naming(self): for t in parsed.tokens if not (t.is_whitespace or isinstance(t, sqlparse.sql.Comment)) ] - is_view_statement = " ".join( - tokens[0].normalized.split() - ) == "CREATE OR REPLACE" and ( + is_view_statement = " ".join(tokens[0].normalized.split()) in ( + "CREATE", + "CREATE OR REPLACE", + ) and ( tokens[1].normalized == "VIEW" or ( tokens[1].normalized.upper() == "MATERIALIZED" diff --git a/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql b/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql index 56c823b96fc..846efef69fc 100644 --- a/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql +++ b/sql_generators/glean_usage/templates/event_monitoring_live_v1.view.sql @@ -1,11 +1,9 @@ -CREATE OR REPLACE MATERIALIZED VIEW - `{{ project_id }}.{{ derived_dataset }}.event_monitoring_live_v1` -OPTIONS( - enable_refresh = TRUE, - refresh_interval_minutes = 60 -) -AS - {% if dataset_id not in ["telemetry", "accounts_frontend", "accounts_backend"] %} +CREATE MATERIALIZED VIEW +IF + NOT EXISTS `{{ project_id }}.{{ derived_dataset }}.event_monitoring_live_v1` + OPTIONS + (enable_refresh = TRUE, refresh_interval_minutes = 60) AS + {% if dataset_id not in ["telemetry", "accounts_frontend", "accounts_backend"] %} SELECT DATE(submission_timestamp) AS submission_date, TIMESTAMP_ADD( @@ -95,7 +93,7 @@ AS UNNEST(GENERATE_ARRAY(0, ARRAY_LENGTH(ping_info.experiments))) AS experiment_index LEFT JOIN UNNEST(event.extra) AS event_extra - {% elif dataset_id in ["accounts_frontend", "accounts_backend"] %} + {% elif dataset_id in ["accounts_frontend", "accounts_backend"] %} -- FxA uses custom pings to send events without a category and extras. SELECT DATE(submission_timestamp) AS submission_date, @@ -157,7 +155,7 @@ AS -- Iterator for accessing experiments. -- Add one more for aggregating events across all experiments UNNEST(GENERATE_ARRAY(0, ARRAY_LENGTH(ping_info.experiments))) AS experiment_index - {% endif %} + {% endif %} WHERE DATE(submission_timestamp) >= "{{ current_date }}" GROUP BY From efe99b00c16abd5b30b67d9ba09de7caa787458d Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Thu, 16 Nov 2023 19:55:26 -0800 Subject: [PATCH 7/9] Handle `IF NOT EXISTS` in view naming validation. --- bigquery_etl/view/__init__.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/bigquery_etl/view/__init__.py b/bigquery_etl/view/__init__.py index 55c27bb223b..096cbcbc2fc 100644 --- a/bigquery_etl/view/__init__.py +++ b/bigquery_etl/view/__init__.py @@ -195,18 +195,23 @@ def _valid_view_naming(self): "CREATE OR REPLACE", ) and ( tokens[1].normalized == "VIEW" - or ( - tokens[1].normalized.upper() == "MATERIALIZED" - and tokens[2].normalized == "VIEW" - ) + or " ".join(t.normalized for t in tokens[1:3]).upper() + == "MATERIALIZED VIEW" ) if is_view_statement: - view_id_token = tokens[2] if tokens[1].normalized == "VIEW" else tokens[3] - target_view = str(view_id_token).strip().split()[0] - try: - [project_id, dataset_id, view_id] = target_view.replace("`", "").split( - "." + view_kw_index = 1 if tokens[1].normalized == "VIEW" else 2 + if ( + " ".join( + t.normalized for t in tokens[view_kw_index + 1 : view_kw_index + 4] ) + == "IF NOT EXISTS" + ): + view_id_token = tokens[view_kw_index + 4] + else: + view_id_token = tokens[view_kw_index + 1] + target_view = str(view_id_token).replace("`", "").strip().split()[0] + try: + [project_id, dataset_id, view_id] = target_view.split(".") if not ( self.name == view_id and self.dataset == dataset_id From 12808ceabf12b2c60999cb1854ba66287c760148 Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Tue, 28 Nov 2023 08:29:31 -0800 Subject: [PATCH 8/9] Use regular expression to extract view ID in view naming validation. This simplifies the logic and avoids a sqlparse bug where it doesn't recognize the `MATERIALIZED` keyword. --- bigquery_etl/view/__init__.py | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/bigquery_etl/view/__init__.py b/bigquery_etl/view/__init__.py index 096cbcbc2fc..8ac6479a14e 100644 --- a/bigquery_etl/view/__init__.py +++ b/bigquery_etl/view/__init__.py @@ -184,32 +184,14 @@ def _valid_fully_qualified_references(self): def _valid_view_naming(self): """Validate that the created view naming matches the directory structure.""" - parsed = sqlparse.parse(self.content)[0] - tokens = [ - t - for t in parsed.tokens - if not (t.is_whitespace or isinstance(t, sqlparse.sql.Comment)) - ] - is_view_statement = " ".join(tokens[0].normalized.split()) in ( - "CREATE", - "CREATE OR REPLACE", - ) and ( - tokens[1].normalized == "VIEW" - or " ".join(t.normalized for t in tokens[1:3]).upper() - == "MATERIALIZED VIEW" - ) - if is_view_statement: - view_kw_index = 1 if tokens[1].normalized == "VIEW" else 2 - if ( - " ".join( - t.normalized for t in tokens[view_kw_index + 1 : view_kw_index + 4] - ) - == "IF NOT EXISTS" - ): - view_id_token = tokens[view_kw_index + 4] - else: - view_id_token = tokens[view_kw_index + 1] - target_view = str(view_id_token).replace("`", "").strip().split()[0] + sql = sqlparse.format(self.content, strip_comments=True).strip() + if view_statement_match := re.match( + r"CREATE(?:\s+OR\s+REPLACE)?(?:\s+MATERIALIZED)?\s+VIEW(?:\s+IF\s+NOT\s+EXISTS)?" + r"\s+(?P(?:(?:`?[\w-]+`?\.)?`?\w+`?\.)?`?\w+`?)", + sql, + re.IGNORECASE, + ): + target_view = view_statement_match["view_id"].replace("`", "") try: [project_id, dataset_id, view_id] = target_view.split(".") if not ( From 7a6651b7363495762343bfc30d4f377ad0f8aadd Mon Sep 17 00:00:00 2001 From: Sean Rose Date: Tue, 28 Nov 2023 08:54:16 -0800 Subject: [PATCH 9/9] Update other view regular expressions to allow for materialized views. --- bigquery_etl/view/__init__.py | 3 ++- sql_generators/stable_views/__init__.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bigquery_etl/view/__init__.py b/bigquery_etl/view/__init__.py index 8ac6479a14e..5d39ddb705e 100644 --- a/bigquery_etl/view/__init__.py +++ b/bigquery_etl/view/__init__.py @@ -26,7 +26,8 @@ # Regex matching CREATE VIEW statement so it can be removed to get the view query CREATE_VIEW_PATTERN = re.compile( - r"CREATE\s+OR\s+REPLACE\s+VIEW\s+[^\s]+\s+AS", re.IGNORECASE + r"CREATE(?:\s+OR\s+REPLACE)?(?:\s+MATERIALIZED)?\s+VIEW(?:\s+IF\s+NOT\s+EXISTS)?\s+[^\s]+\s+AS", + re.IGNORECASE, ) diff --git a/sql_generators/stable_views/__init__.py b/sql_generators/stable_views/__init__.py index 981e300da10..213e2fc4e75 100644 --- a/sql_generators/stable_views/__init__.py +++ b/sql_generators/stable_views/__init__.py @@ -110,7 +110,8 @@ def write_view_if_not_exists(target_project: str, sql_dir: Path, schema: SchemaF from sql_generators.stable_views import VIEW_METADATA_TEMPLATE, VIEW_QUERY_TEMPLATE VIEW_CREATE_REGEX = re.compile( - r"CREATE OR REPLACE VIEW\n\s*[^\s]+\s*\nAS", re.IGNORECASE + r"CREATE(?:\s+OR\s+REPLACE)?(?:\s+MATERIALIZED)?\s+VIEW(?:\s+IF\s+NOT\s+EXISTS)?\s+[^\s]+\s+AS", + re.IGNORECASE, ) SKIP_VIEW_SCHEMA = {