diff --git a/CHANGELOG.md b/CHANGELOG.md index a1e1ceec2..2dba5af71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ - Adding capability to specify compute on a per model basis ([488](https://github.com/databricks/dbt-databricks/pull/488)) - Selectively persist column docs that have changed between runs of incremental ([513](https://github.com/databricks/dbt-databricks/pull/513)) - Enabling access control list for job runs (thanks @srggrs!)([518](https://github.com/databricks/dbt-databricks/pull/518)) +- Allow persisting of column comments on views and retrieving comments for docs on Hive ([519](https://github.com/databricks/dbt-databricks/pull/519)) ## dbt-databricks 1.7.1 (Nov 13, 2023) diff --git a/dbt/adapters/databricks/column.py b/dbt/adapters/databricks/column.py index e717debd3..c0e4367c9 100644 --- a/dbt/adapters/databricks/column.py +++ b/dbt/adapters/databricks/column.py @@ -6,6 +6,7 @@ @dataclass class DatabricksColumn(SparkColumn): + table_comment: Optional[str] = None comment: Optional[str] = None TYPE_LABELS: ClassVar[Dict[str, str]] = { diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index 15ab317da..0b8fdf472 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -64,6 +64,7 @@ SHOW_TABLE_EXTENDED_MACRO_NAME = "show_table_extended" SHOW_TABLES_MACRO_NAME = "show_tables" SHOW_VIEWS_MACRO_NAME = "show_views" +GET_COLUMNS_COMMENTS_MACRO_NAME = "get_columns_comments" @dataclass @@ -104,6 +105,8 @@ def get_identifier_list_string(table_names: Set[str]) -> str: @undefined_proof class DatabricksAdapter(SparkAdapter): + INFORMATION_COMMENT_REGEX = re.compile(r"Comment: (.*)\n[A-Z][A-Za-z ]+:", re.DOTALL) + Relation = DatabricksRelation Column = DatabricksColumn @@ -388,6 +391,7 @@ def parse_describe_extended( # type: ignore[override] table_type=relation.type, table_owner=str(metadata.get(KEY_TABLE_OWNER)), table_stats=table_stats, + table_comment=metadata.get("Comment"), column=column["col_name"], column_index=idx, dtype=column["data_type"], @@ -444,16 +448,24 @@ def _set_relation_information(self, relation: DatabricksRelation) -> DatabricksR return self._get_updated_relation(relation)[0] + def _get_column_comments(self, relation: DatabricksRelation) -> Dict[str, str]: + """Get the column comments for the relation.""" + columns = self.execute_macro(GET_COLUMNS_COMMENTS_MACRO_NAME, kwargs={"relation": relation}) + return {row[0]: row[2] for row in columns} + def parse_columns_from_information( # type: ignore[override] self, relation: DatabricksRelation, information: str ) -> List[DatabricksColumn]: owner_match = re.findall(self.INFORMATION_OWNER_REGEX, information) owner = owner_match[0] if owner_match else None + comment_match = re.findall(self.INFORMATION_COMMENT_REGEX, information) + table_comment = comment_match[0] if comment_match else None matches = re.finditer(self.INFORMATION_COLUMNS_REGEX, information) columns = [] stats_match = re.findall(self.INFORMATION_STATISTICS_REGEX, information) raw_table_stats = stats_match[0] if stats_match else None table_stats = DatabricksColumn.convert_table_stats(raw_table_stats) + for match_num, match in enumerate(matches): column_name, column_type, nullable = match.groups() column = DatabricksColumn( @@ -461,6 +473,7 @@ def parse_columns_from_information( # type: ignore[override] table_schema=relation.schema, table_name=relation.table, table_type=relation.type, + table_comment=table_comment, column_index=match_num, table_owner=owner, column=column_name, @@ -583,11 +596,13 @@ def _get_columns_for_catalog( # type: ignore[override] ) -> Iterable[Dict[str, Any]]: columns = self.parse_columns_from_information(relation, information) + comments = self._get_column_comments(relation) for column in columns: # convert DatabricksRelation into catalog dicts as_dict = column.to_column_dict() as_dict["column_name"] = as_dict.pop("column", None) as_dict["column_type"] = as_dict.pop("dtype") + as_dict["column_comment"] = comments[as_dict["column_name"]] yield as_dict def add_query( diff --git a/dbt/include/databricks/macros/adapters.sql b/dbt/include/databricks/macros/adapters.sql index 4010a433b..4c2848fe8 100644 --- a/dbt/include/databricks/macros/adapters.sql +++ b/dbt/include/databricks/macros/adapters.sql @@ -101,8 +101,32 @@ {%- endif -%} {%- endmacro -%} +{% macro get_column_comment_sql(column_name, column_dict) -%} + {% if column_name in column_dict and column_dict[column_name]["description"] -%} + {% set escaped_description = column_dict[column_name]["description"] | replace("'", "\\'") %} + {% set column_comment_clause = "comment '" ~ escaped_description ~ "'" %} + {%- endif -%} + {{ adapter.quote(column_name) }} {{ column_comment_clause }} +{% endmacro %} + +{% macro get_persist_docs_column_list(model_columns, query_columns) %} + {% for column_name in query_columns %} + {{ get_column_comment_sql(column_name, model_columns) }} + {{- ", " if not loop.last else "" }} + {% endfor %} +{% endmacro %} + {% macro databricks__create_view_as(relation, sql) -%} create or replace view {{ relation }} + {% if config.persist_column_docs() -%} + {% set model_columns = model.columns %} + {% set query_columns = get_columns_in_query(sql) %} + {% if query_columns %} + ( + {{ get_persist_docs_column_list(model_columns, query_columns) }} + ) + {% endif %} + {% endif %} {{ comment_clause() }} {%- set contract_config = config.get('contract') -%} {% if contract_config and contract_config.enforced %} @@ -512,4 +536,12 @@ {%- set columns_to_persist_docs = adapter.get_persist_doc_columns(existing_columns, model.columns) -%} {% do alter_column_comment(relation, columns_to_persist_docs) %} {% endif %} +{% endmacro %} + +{% macro get_columns_comments(relation) -%} + {% call statement('get_columns_comments', fetch_result=True) -%} + describe table {{ relation }} + {% endcall %} + + {% do return(load_result('get_columns_comments').table) %} {% endmacro %} \ No newline at end of file diff --git a/tests/functional/adapter/persist_docs/test_persist_docs.py b/tests/functional/adapter/persist_docs/test_persist_docs.py new file mode 100644 index 000000000..f0af80d4e --- /dev/null +++ b/tests/functional/adapter/persist_docs/test_persist_docs.py @@ -0,0 +1,67 @@ +import json +from dbt.tests.adapter.persist_docs.test_persist_docs import ( + BasePersistDocsBase, + BasePersistDocsColumnMissing, + BasePersistDocsCommentOnQuotedColumn, +) +from dbt.tests import util + +import pytest + + +class TestPersistDocs(BasePersistDocsBase): + def _assert_has_view_comments( + self, view_node, has_node_comments=True, has_column_comments=True + ): + view_comment = view_node["metadata"]["comment"] + if has_node_comments: + assert view_comment.startswith("View model description") + self._assert_common_comments(view_comment) + else: + assert view_comment == "" or view_comment is None + + view_id_comment = view_node["columns"]["id"]["comment"] + if has_column_comments: + assert view_id_comment.startswith("id Column description") + self._assert_common_comments(view_id_comment) + else: + assert view_id_comment is None + + view_name_comment = view_node["columns"]["name"]["comment"] + assert view_name_comment is None + + @pytest.fixture(scope="class") + def project_config_update(self): + return { + "models": { + "test": { + "+persist_docs": { + "relation": True, + "columns": True, + }, + } + } + } + + def test_has_comments(self, project): + util.run_dbt(["docs", "generate"]) + with open("target/catalog.json") as fp: + catalog_data = json.load(fp) + assert "nodes" in catalog_data + assert len(catalog_data["nodes"]) == 4 + table_node = catalog_data["nodes"]["model.test.table_model"] + self._assert_has_table_comments(table_node) + + view_node = catalog_data["nodes"]["model.test.view_model"] + self._assert_has_view_comments(view_node) + + no_docs_node = catalog_data["nodes"]["model.test.no_docs_model"] + self._assert_has_view_comments(no_docs_node, False, False) + + +class TestPersistDocsColumnMissing(BasePersistDocsColumnMissing): + pass + + +class TestPersistDocsCommentOnQuotedColumn(BasePersistDocsCommentOnQuotedColumn): + pass diff --git a/tests/integration/persist_docs/models/incremental_delta_model.sql b/tests/integration/persist_docs/models/incremental_delta_model.sql deleted file mode 100644 index 5d8842058..000000000 --- a/tests/integration/persist_docs/models/incremental_delta_model.sql +++ /dev/null @@ -1,2 +0,0 @@ -{{ config(materialized='incremental') }} -select 1 as id, 'Joe' as name diff --git a/tests/integration/persist_docs/models/my_fun_docs.md b/tests/integration/persist_docs/models/my_fun_docs.md deleted file mode 100644 index f3c0fbf55..000000000 --- a/tests/integration/persist_docs/models/my_fun_docs.md +++ /dev/null @@ -1,10 +0,0 @@ -{% docs my_fun_doc %} -name Column description "with double quotes" -and with 'single quotes' as welll as other; -'''abc123''' -reserved -- characters --- -/* comment */ -Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - -{% enddocs %} diff --git a/tests/integration/persist_docs/models/no_docs_model.sql b/tests/integration/persist_docs/models/no_docs_model.sql deleted file mode 100644 index e39a7a156..000000000 --- a/tests/integration/persist_docs/models/no_docs_model.sql +++ /dev/null @@ -1 +0,0 @@ -select 1 as id, 'Alice' as name diff --git a/tests/integration/persist_docs/models/schema.yml b/tests/integration/persist_docs/models/schema.yml deleted file mode 100644 index c2715ccf6..000000000 --- a/tests/integration/persist_docs/models/schema.yml +++ /dev/null @@ -1,119 +0,0 @@ -version: 2 - -models: - - - name: table_parquet_model - description: | - Table model description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - columns: - - name: id - description: | - id Column description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - - name: name - description: | - Some stuff here and then a call to - {{ doc('my_fun_doc')}} - - - name: table_delta_model - description: | - Table model description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - columns: - - name: id - description: | - id Column description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - - name: name - description: | - Some stuff here and then a call to - {{ doc('my_fun_doc')}} - - - name: table_hudi_model - description: | - Table model description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - columns: - - name: id - description: | - id Column description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - - name: name - description: | - Some stuff here and then a call to - {{ doc('my_fun_doc')}} - - - name: view_model - description: | - View model description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - columns: - - name: id - description: | - id Column description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - - - name: incremental_delta_model - description: | - Incremental model description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - columns: - - name: id - description: | - id Column description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - - name: name - description: | - Some stuff here and then a call to - {{ doc('my_fun_doc')}} diff --git a/tests/integration/persist_docs/models/table_delta_model.sql b/tests/integration/persist_docs/models/table_delta_model.sql deleted file mode 100644 index c0e93c3f3..000000000 --- a/tests/integration/persist_docs/models/table_delta_model.sql +++ /dev/null @@ -1,2 +0,0 @@ -{{ config(materialized='table') }} -select 1 as id, 'Joe' as name diff --git a/tests/integration/persist_docs/models/view_model.sql b/tests/integration/persist_docs/models/view_model.sql deleted file mode 100644 index a6f96a16d..000000000 --- a/tests/integration/persist_docs/models/view_model.sql +++ /dev/null @@ -1,2 +0,0 @@ -{{ config(materialized='view') }} -select 2 as id, 'Bob' as name diff --git a/tests/integration/persist_docs/seeds/seed.csv b/tests/integration/persist_docs/seeds/seed.csv deleted file mode 100644 index 4a295177c..000000000 --- a/tests/integration/persist_docs/seeds/seed.csv +++ /dev/null @@ -1,3 +0,0 @@ -id,name -1,Alice -2,Bob \ No newline at end of file diff --git a/tests/integration/persist_docs/seeds/seeds.yml b/tests/integration/persist_docs/seeds/seeds.yml deleted file mode 100644 index 7ab82fa6b..000000000 --- a/tests/integration/persist_docs/seeds/seeds.yml +++ /dev/null @@ -1,26 +0,0 @@ -version: 2 - -seeds: - - name: seed - description: | - Seed model description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - columns: - - name: id - description: | - id Column description "with double quotes" - and with 'single quotes' as welll as other; - '''abc123''' - reserved -- characters - -- - /* comment */ - Some $lbl$ labeled $lbl$ and $$ unlabeled $$ dollar-quoting - - name: name - description: | - Some stuff here and then a call to - {{ doc('my_fun_doc')}} diff --git a/tests/integration/persist_docs/test_persist_docs.py b/tests/integration/persist_docs/test_persist_docs.py deleted file mode 100644 index 8326950c5..000000000 --- a/tests/integration/persist_docs/test_persist_docs.py +++ /dev/null @@ -1,70 +0,0 @@ -from tests.integration.base import DBTIntegrationTest, use_profile - - -class TestPersistDocsDelta(DBTIntegrationTest): - @property - def schema(self): - return "persist_docs_columns" - - @property - def models(self): - return "models" - - @property - def project_config(self): - return { - "config-version": 2, - "models": { - "test": { - "+persist_docs": { - "relation": True, - "columns": True, - }, - } - }, - "seeds": { - "test": { - "+persist_docs": { - "relation": True, - "columns": True, - }, - "+file_format": "delta", - "+quote_columns": True, - } - }, - } - - def _test_delta_comments(self): - self.run_dbt(["seed"]) - self.run_dbt(["run"]) - - for table, whatis in [ - ("table_delta_model", "Table"), - ("seed", "Seed"), - ("incremental_delta_model", "Incremental"), - ]: - results = self.run_sql( - "describe extended {database_schema}.{table}", - fetch="all", - kwargs=dict(table=table), - ) - - for result in results: - if result[0] == "Comment": - assert result[1].startswith(f"{whatis} model description") - if result[0] == "id": - assert result[2].startswith("id Column description") - if result[0] == "name": - assert result[2].startswith("Some stuff here and then a call to") - - @use_profile("databricks_cluster") - def test_delta_comments_databricks_cluster(self): - self._test_delta_comments() - - @use_profile("databricks_uc_cluster") - def test_delta_comments_databricks_uc_cluster(self): - self._test_delta_comments() - - @use_profile("databricks_uc_sql_endpoint") - def test_delta_comments_databricks_uc_sql_endpoint(self): - self._test_delta_comments() diff --git a/tests/unit/macros/test_adapters_macros.py b/tests/unit/macros/test_adapters_macros.py index 2f997cce4..c86785e07 100644 --- a/tests/unit/macros/test_adapters_macros.py +++ b/tests/unit/macros/test_adapters_macros.py @@ -1,3 +1,4 @@ +from mock import MagicMock from dbt.adapters.databricks.relation import DatabricksRelation from tests.unit.macros.base import TestMacros @@ -210,7 +211,7 @@ def test_macros_create_table_as_all_hudi(self): def test_macros_create_view_as_tblproperties(self): self.config["tblproperties"] = {"tblproperties_to_view": "true"} self.default_context["model"].alias = "my_table" - + self.default_context["get_columns_in_query"] = MagicMock(return_value=[]) sql = self._run_macro("databricks__create_view_as", "my_table", "select 1") self.assertEqual( diff --git a/tests/unit/test_adapter.py b/tests/unit/test_adapter.py index 5cef63a2c..ebfa29aa9 100644 --- a/tests/unit/test_adapter.py +++ b/tests/unit/test_adapter.py @@ -424,6 +424,7 @@ def test_parse_relation(self): "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "col1", "column_index": 0, "dtype": "decimal(22,0)", @@ -442,6 +443,7 @@ def test_parse_relation(self): "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "col2", "column_index": 1, "dtype": "string", @@ -460,6 +462,7 @@ def test_parse_relation(self): "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "dt", "column_index": 2, "dtype": "date", @@ -478,6 +481,7 @@ def test_parse_relation(self): "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "struct_col", "column_index": 3, "dtype": "struct", @@ -530,6 +534,7 @@ def test_parse_relation_with_statistics(self): ("Owner", "root", None), ("Created Time", "Wed Feb 04 18:15:00 UTC 1815", None), ("Last Access", "Wed May 20 19:25:00 UTC 1925", None), + ("Comment", "Table model description", None), ("Statistics", "1109049927 bytes, 14093476 rows", None), ("Type", "MANAGED", None), ("Provider", "delta", None), @@ -554,6 +559,7 @@ def test_parse_relation_with_statistics(self): "Owner": "root", "Created Time": "Wed Feb 04 18:15:00 UTC 1815", "Last Access": "Wed May 20 19:25:00 UTC 1925", + "Comment": "Table model description", "Statistics": "1109049927 bytes, 14093476 rows", "Type": "MANAGED", "Provider": "delta", @@ -574,6 +580,7 @@ def test_parse_relation_with_statistics(self): "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": "Table model description", "column": "col1", "column_index": 0, "comment": "comment", @@ -643,6 +650,7 @@ def test_parse_columns_from_information_with_table_type_and_delta_provider(self) "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "col1", "column_index": 0, "dtype": "decimal(22,0)", @@ -665,6 +673,7 @@ def test_parse_columns_from_information_with_table_type_and_delta_provider(self) "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "struct_col", "column_index": 3, "dtype": "struct", @@ -730,6 +739,7 @@ def test_parse_columns_from_information_with_view_type(self): "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "col2", "column_index": 1, "comment": None, @@ -748,6 +758,7 @@ def test_parse_columns_from_information_with_view_type(self): "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "struct_col", "column_index": 3, "comment": None, @@ -798,6 +809,7 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "dt", "column_index": 2, "comment": None, @@ -824,6 +836,7 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel "table_name": relation.name, "table_type": rel_type, "table_owner": "root", + "table_comment": None, "column": "struct_col", "column_index": 3, "comment": None,