From 6b44ac15b3e55ce1e3ba96ca873e8beb85a2a0fa Mon Sep 17 00:00:00 2001 From: Bentsi Leviav Date: Wed, 7 Aug 2024 21:52:20 +0300 Subject: [PATCH] Update ClickHouse doc (#5900) ## What are you changing in this pull request and why? Update ClickHouse DBT documentation. ## Checklist - [ x] Review the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) so my content adheres to these guidelines. - [ x] For [docs versioning](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#about-versioning), review how to [version a whole page](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) and [version a block of content](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#versioning-blocks-of-content). - [ x] Add a checklist item for anything that needs to happen before this PR is merged, such as "needs technical review" or "change base branch." --------- Co-authored-by: Matt Shaver <60105315+matthewshaver@users.noreply.github.com> --- .../connect-data-platform/clickhouse-setup.md | 119 ++++++++++-------- .../resource-configs/clickhouse-configs.md | 113 ++++++++++------- 2 files changed, 138 insertions(+), 94 deletions(-) diff --git a/website/docs/docs/core/connect-data-platform/clickhouse-setup.md b/website/docs/docs/core/connect-data-platform/clickhouse-setup.md index fce367be812..f1114f1505b 100644 --- a/website/docs/docs/core/connect-data-platform/clickhouse-setup.md +++ b/website/docs/docs/core/connect-data-platform/clickhouse-setup.md @@ -3,7 +3,7 @@ title: "ClickHouse setup" description: "Read this guide to learn about the ClickHouse warehouse setup in dbt." 
meta: maintained_by: Community - authors: 'Geoff Genz' + authors: 'Geoff Genz & Bentsi Leviav' github_repo: 'ClickHouse/dbt-clickhouse' pypi_package: 'dbt-clickhouse' min_core_version: 'v0.19.0' @@ -15,7 +15,8 @@ meta: config_page: '/reference/resource-configs/clickhouse-configs' --- -Some core functionality may be limited. If you're interested in contributing, check out the source code for each repository listed below. +Some core functionality may be limited. If you're interested in contributing, check out the source code for each +repository listed below. import SetUpPages from '/snippets/_setup-pages-intro.md'; @@ -23,7 +24,8 @@ import SetUpPages from '/snippets/_setup-pages-intro.md'; ## Connecting to ClickHouse with **dbt-clickhouse** -To connect to ClickHouse from dbt, you'll need to add a [profile](https://docs.getdbt.com/docs/core/connection-profiles) to your `profiles.yml` file. A ClickHouse profile conforms to the following syntax: +To connect to ClickHouse from dbt, you'll need to add a [profile](https://docs.getdbt.com/docs/core/connection-profiles) +to your `profiles.yml` file. A ClickHouse profile conforms to the following syntax: @@ -33,59 +35,74 @@ To connect to ClickHouse from dbt, you'll need to add a [profile](https://docs.g outputs: : type: clickhouse - schema: - user: - password: - #optional fields - driver: http|native - port: - host: - retries: 1 - verify: False - secure: True - connect_timeout: 10 - send_receive_timeout: 300 - sync_request_timeout: 5 - compression: False - compress_block_size: 1048576 - database_engine: - check_exchange: True - use_lw_deletes: False - custom_settings: - + schema: [ default ] # ClickHouse database for dbt models + + # optional + driver: [ http ] # http or native. 
If not configured, this will be auto-determined based on the port setting + host: [ localhost ] + port: [ 8123 ] # Defaults to 8123, 8443, 9000, 9440 depending on the secure and driver settings + user: [ default ] # User for all database operations + password: [ ] # Password for the user + cluster: [ ] # If configured, certain DDL/table operations will be executed with the `ON CLUSTER` clause using this cluster. Distributed materializations require this setting to work. See the following ClickHouse Cluster section for more details. + verify: [ True ] # Validate TLS certificate if using TLS/SSL + secure: [ False ] # Use TLS (native protocol) or HTTPS (http protocol) + retries: [ 1 ] # Number of times to retry a "retriable" database exception (such as a 503 'Service Unavailable' error) + compression: [ ] # Use gzip compression if truthy (http), or compression type for a native connection + connect_timeout: [ 10 ] # Timeout in seconds to establish a connection to ClickHouse + send_receive_timeout: [ 300 ] # Timeout in seconds to receive data from the ClickHouse server + cluster_mode: [ False ] # Use specific settings designed to improve operation on Replicated databases (recommended for ClickHouse Cloud) + use_lw_deletes: [ False ] # Use the strategy `delete+insert` as the default incremental strategy. + check_exchange: [ True ] # Validate that clickhouse support the atomic EXCHANGE TABLES command. (Not needed for most ClickHouse versions) + local_suffix: [ _local ] # Table suffix of local tables on shards for distributed materializations. + local_db_prefix: [ ] # Database prefix of local tables on shards for distributed materializations. If empty, it uses the same database as the distributed table. + allow_automatic_deduplication: [ False ] # Enable ClickHouse automatic deduplication for Replicated tables + tcp_keepalive: [ False ] # Native client only, specify TCP keepalive configuration. Specify custom keepalive settings as [idle_time_sec, interval_sec, probes]. 
+ custom_settings: [ { } ] # A dictionary/mapping of custom ClickHouse settings for the connection - default is empty. + + # Native (clickhouse-driver) connection settings + sync_request_timeout: [ 5 ] # Timeout for server ping + compress_block_size: [ 1048576 ] # Compression block size if compression is enabled + + ``` -#### Description of ClickHouse Profile Fields - - -| Field | Description | -|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `type` | This must be included either in `profiles.yml` or in the `dbt_project.yml` file. Must be set to `clickhouse`. | -| `schema` | Required. A ClickHouse's database name. The dbt model database.schema.table is not compatible with ClickHouse because ClickHouse does not support a schema. So we use a simple model schema.table, where schema is the ClickHouse's database. We don't recommend using the `default` database. | -| `user` | Required. A ClickHouse username with adequate permissions to access the specified `schema`. | -| `password` | Required. The password associated with the specified `user`. | -| `driver` | Optional. The ClickHouse client interface, `http` or `native`. Defaults to `http` unless the `port` is set to 9440 or 9400, in which case the `native` driver is assumed. | -| `port` | Optional. ClickHouse server port number. Defaults to 8123/8443 (secure) if the driver is `http`, and to 9000/9440(secure) if the driver is `native`. | -| `host` | Optional. The host name of the connection. Default is `localhost`. | -| `retries` | Optional. 
Number of times to retry the initial connection attempt if the error appears to be recoverable. | -| `verify` | Optional. For (`secure=True`) connections, validate the ClickHouse server TLS certificate, including matching hostname, expiration, and signed by a trusted Certificate Authority. Defaults to True. | -| `secure` | Optional. Whether the connection (either http or native) is secured by TLS. This converts an http driver connection to https, and a native driver connection to the native ClickHouse protocol over TLS. the Defaults to False. | -| `cluster_mode` | Optional. Add connection settings to improve compatibility with clusters using the Replicated Database Engine. Default False. | -| `connect_timeout` | Optional. Connection timeout in seconds. Defaults is 10 seconds. | -| `send_receive_timeout` | Optional. Timeout for receiving data from or sending data to ClickHouse. Defaults to 5 minutes (300 seconds) | -| `sync_request_timeout` | Optional. Timeout for connection ping request (native connection only). Defaults to 5 seconds. | -| `compression` | Optional. Use compression in the connection. Defaults to `False`. If set to `True` for HTTP, this enables gzip compression. If set to `True` for the native protocol, this enabled lz4 compression. Other valid values are `lz4hc` and `zstd` for the native driver only. | -| `compress_block_size` | Optional. Compression block size (in bytes) when using compression with the native driver. Defaults to 1MB | -| `database_engine` | Optional. Database engine to use when creating new ClickHouse schemas (databases). If not set (the default), new databases will use the default ClickHouse database engine (usually Atomic). | -| `check_exchange` | Optional. On connecting to the ClickHouse, if this is parameter is `True` DBT will validate that the ClickHouse server supports atomic exchange of tables. Using atomic exchange (when available) improves reliability and parallelism. 
This check is unnecessary for ClickHouse running on recent Linux operating system, and in those circumstances can be disabled by setting `check_exchange` to `False` to avoid additional overhead on startup. Defaults to `True`. | -| `use_lw_deletes` | Optional. If ClickHouse experimental lightweight deletes are available, use the `delete+insert` strategy as the default strategy for incremental materializations. Defaults to `False` (use legacy strategy). | -| `custom_settings` | Optional. A mapping of ClickHouse specific user settings to use with the connection. See the ClickHouse documentation for supported settings. | - -#### Troubleshooting Connections +### Description of ClickHouse Profile Fields + +| Field | Description | +|---------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `type` | This must be included either in `profiles.yml` or in the `dbt_project.yml` file. Must be set to `clickhouse`. | +| `schema` | Required. A ClickHouse's database name. The dbt model database.schema.table is not compatible with ClickHouse because ClickHouse does not support a schema. So we use a simple model schema.table, where schema is the ClickHouse's database. We don't recommend using the `default` database. | +| `driver` | Optional. The ClickHouse client interface, `http` or `native`. Defaults to `http` unless the `port` is set to 9440 or 9400, in which case the `native` driver is assumed. | +| `host` | Optional. The host name of the connection. Default is `localhost`. | +| `port` | Optional. ClickHouse server port number. 
Defaults to 8123/8443 (secure) if the driver is `http`, and to 9000/9440 (secure) if the driver is `native`. | +| `user` | Required. A ClickHouse username with adequate permissions to access the specified `schema`. | +| `password` | Required. The password associated with the specified `user`. | +| `cluster` | Optional. If set, certain DDL/table operations will be executed with the `ON CLUSTER` clause using this cluster. Distributed materializations require this setting to work. See the following ClickHouse Cluster section for more details. | +| `verify` | Optional. For (`secure=True`) connections, validate the ClickHouse server TLS certificate, including matching hostname, expiration, and signed by a trusted Certificate Authority. Defaults to True. | +| `secure` | Optional. Whether the connection (either http or native) is secured by TLS. This converts an http driver connection to https, and a native driver connection to the native ClickHouse protocol over TLS. Defaults to False. | +| `retries` | Optional. Number of times to retry the initial connection attempt if the error appears to be recoverable. | +| `compression` | Optional. Use compression in the connection. Defaults to `False`. If set to `True` for HTTP, this enables gzip compression. If set to `True` for the native protocol, this enables lz4 compression. Other valid values are `lz4hc` and `zstd` for the native driver only. | +| `connect_timeout` | Optional. Connection timeout in seconds. Default is 10 seconds. | +| `send_receive_timeout` | Optional. Timeout for receiving data from or sending data to ClickHouse. Defaults to 5 minutes (300 seconds) | +| `cluster_mode` | Optional. Add connection settings to improve compatibility with clusters using the Replicated Database Engine. Default False. | +| `use_lw_deletes` | Optional. If ClickHouse experimental lightweight deletes are available, use the `delete+insert` strategy as the default strategy for incremental materializations. 
Defaults to `False` (use legacy strategy). | +| `check_exchange` | Optional. On connecting to ClickHouse, if this parameter is `True`, dbt will validate that the ClickHouse server supports atomic exchange of tables. Using atomic exchange (when available) improves reliability and parallelism. This check is unnecessary for ClickHouse running on recent Linux operating systems, and in those circumstances can be disabled by setting `check_exchange` to `False` to avoid additional overhead on startup. Defaults to `True`. | +| `local_suffix` | Optional. Table suffix of local tables on shards for distributed materializations. Defaults to '_local'. | +| `local_db_prefix` | Optional. Database prefix of local tables on shards for distributed materializations. If empty, it uses the same database as the distributed table. Defaults to empty string. | +| `allow_automatic_deduplication` | Optional. Enable ClickHouse automatic deduplication for Replicated tables. Defaults to False. | +| `tcp_keepalive` | Optional. Native client only, specify TCP keepalive configuration. Specify custom keepalive settings as `idle_time_sec`, `interval_sec`, `probes`. Defaults to False. | +| `sync_request_timeout` | Optional. Timeout for connection ping request (native connection only). Defaults to 5 seconds. | +| `compress_block_size` | Optional. Compression block size (in bytes) when using compression with the native driver. Defaults to 1MB | +| `database_engine` | Optional. Database engine to use when creating new ClickHouse schemas (databases). If not set (the default), new databases will use the default ClickHouse database engine (usually Atomic). | +| `custom_settings` | Optional. A mapping of ClickHouse specific user settings to use with the connection. See the ClickHouse documentation for supported settings. 
| + +## Troubleshooting Connections If you encounter issues connecting to ClickHouse from dbt, make sure the following criteria are met: + - The engine must be one of the [supported engines](/reference/resource-configs/clickhouse-configs#supported-table-engines). - You must have adequate permissions to access the database. -- If you're not using the default table engine for the database, you must specify a table engine in your model configuration. +- If you're not using the default table engine for the database, you must specify a table engine in your model + configuration. diff --git a/website/docs/reference/resource-configs/clickhouse-configs.md b/website/docs/reference/resource-configs/clickhouse-configs.md index 32f0c81f664..02096bb8f23 100644 --- a/website/docs/reference/resource-configs/clickhouse-configs.md +++ b/website/docs/reference/resource-configs/clickhouse-configs.md @@ -13,17 +13,28 @@ id: "clickhouse-configs" | incremental materialization | YES | Creates a table if it doesn't exist, and then writes only updates to it. | | ephemeral materialized | YES | Creates a ephemeral/CTE materialization. This does model is internal to dbt and does not create any database objects | -### View Materialization +## Experimental models +The following are [experimental features](https://clickhouse.com/docs/en/beta-and-experimental-features) in Clickhouse: -A dbt model can be created as a [ClickHouse view](https://clickhouse.com/docs/en/sql-reference/table-functions/view/) and configured using the following syntax: +| Type | Supported? 
| Details | +|-----------------------------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Materialized View materialization | YES, Experimental | Creates a [materialized view](https://clickhouse.com/docs/en/materialized-view). | +| Distributed table materialization | YES, Experimental | Creates a [distributed table](https://clickhouse.com/docs/en/engines/table-engines/special/distributed). | +| Distributed incremental materialization | YES, Experimental | Incremental model based on the same idea as distributed table. Note that not all strategies are supported, visit [this](https://github.com/ClickHouse/dbt-clickhouse?tab=readme-ov-file#distributed-incremental-materialization) for more info. | +| Dictionary materialization | YES, Experimental | Creates a [dictionary](https://clickhouse.com/docs/en/engines/table-engines/special/dictionary). | + +### View materialization + +A dbt model can be created as a [ClickHouse view](https://clickhouse.com/docs/en/sql-reference/table-functions/view/) +and configured using the following syntax: @@ -49,17 +60,18 @@ models: -### Table Materialization +### Table materialization -A dbt model can be created as a [ClickHouse table](https://clickhouse.com/docs/en/operations/system-tables/tables/) and configured using the following syntax: +A dbt model can be created as a [ClickHouse table](https://clickhouse.com/docs/en/operations/system-tables/tables/) and +configured using the following syntax: @@ -95,7 +107,7 @@ models: -#### Table Configuration +#### Table configuration | Option | Description | Required? 
| |----------------|------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------| @@ -104,17 +116,19 @@ models: | `order_by` | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | Optional (default: `tuple()`) | | `partition_by` | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | Optional | -### Incremental Materialization +### Incremental materialization -Table model will be reconstructed for each dbt execution. This may be infeasible and extremely costly for larger result sets or complex transformations. To address this challenge and reduce the build time, a dbt model can be created as an incremental ClickHouse table and is configured using the following syntax: +Table model will be reconstructed for each dbt execution. This may be infeasible and extremely costly for larger result +sets or complex transformations. To address this challenge and reduce the build time, a dbt model can be created as an +incremental ClickHouse table and is configured using the following syntax: @@ -154,22 +168,24 @@ models: -#### Incremental Table Configuration +#### Incremental table configuration -| Option | Description | Required? | -|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| -| `materialized` | How the model will be materialized into ClickHouse. Must be `table` to create a table model. | Required | -| `unique_key` | A tuple of column names that uniquely identify rows. 
For more details on uniqueness constraints, see [here](/docs/build/incremental-models#defining-a-unique-key-optional). | Required. If not provided altered rows will be added twice to the incremental table. | -| `engine` | The table engine to use when creating tables. See list of supported engines below. | Optional (default: `MergeTree()`) | -| `order_by` | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | Optional (default: `tuple()`) | -| `partition_by` | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | Optional | -| `inserts_only` | (Deprecated, see the `append` materialization strategy). If True, incremental updates will be inserted directly to the target incremental table without creating an intermediate table. | Optional (default: `False`) | -| `incremental_strategy` | The strategy to use for incremental materialization. `delete+insert` and `append` are supported. For additional details on strategies, see [here](https://github.com/ClickHouse/dbt-clickhouse#incremental-model-strategies) | Optional (default: 'default') | -| `incremental_predicates` | Incremental predicate clause to be applied to `delete+insert` materializations | Optional | +| Option | Description | Required? | +|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| +| `materialized` | How the model will be materialized into ClickHouse. Must be `table` to create a table model. | Required | +| `unique_key` | A tuple of column names that uniquely identify rows. 
For more details on uniqueness constraints, see [here](/docs/build/incremental-models#defining-a-unique-key-optional). | Required. If not provided altered rows will be added twice to the incremental table. | +| `engine` | The table engine to use when creating tables. See list of supported engines below. | Optional (default: `MergeTree()`) | +| `order_by` | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | Optional (default: `tuple()`) | +| `partition_by` | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | Optional | +| `inserts_only` | (Deprecated, see the `append` materialization strategy). If True, incremental updates will be inserted directly to the target incremental table without creating an intermediate table. | Optional (default: `False`) | +| `incremental_strategy` | The strategy to use for incremental materialization. `delete+insert`, `append` and `insert_overwrite` (experimental) are supported. For additional details on strategies, see [here](https://github.com/ClickHouse/dbt-clickhouse#incremental-model-strategies) | Optional (default: 'default') | +| `incremental_predicates` | Incremental predicate clause to be applied to `delete+insert` materializations | Optional | ## Snapshot -dbt snapshots allow a record to be made of changes to a mutable model over time. This in turn allows point-in-time queries on models, where analysts can “look back in time” at the previous state of a model. This functionality is supported by the ClickHouse connector and is configured using the following syntax: +dbt snapshots allow a record to be made of changes to a mutable model over time. This in turn allows point-in-time +queries on models, where analysts can “look back in time” at the previous state of a model. 
This functionality is +supported by the ClickHouse connector and is configured using the following syntax: @@ -211,7 +227,7 @@ dbt snapshots allow a record to be made of changes to a mutable model over time. For more information on configuration, check out the [snapshot configs](/reference/snapshot-configs) reference page. -## Supported Table Engines +## Supported table engines | Type | Details | |------------------------|-------------------------------------------------------------------------------------------| @@ -222,14 +238,25 @@ For more information on configuration, check out the [snapshot configs](/referen | EmbeddedRocksDB | https://clickhouse.com/docs/en/engines/table-engines/integrations/embedded-rocksdb | | Hive | https://clickhouse.com/docs/en/engines/table-engines/integrations/hive | -If you encounter issues connecting to ClickHouse from dbt with one of the above engines, please report an issue [here](https://github.com/ClickHouse/dbt-clickhouse/issues). +## Experimental supported table engines -## Cross Database Macro Support +| Type | Details | +|-------------------|---------------------------------------------------------------------------| +| Distributed Table | https://clickhouse.com/docs/en/engines/table-engines/special/distributed. | +| Dictionary | https://clickhouse.com/docs/en/engines/table-engines/special/dictionary | + +If you encounter issues connecting to ClickHouse from dbt with one of the above engines, please report an +issue [here](https://github.com/ClickHouse/dbt-clickhouse/issues). + +## Cross database macro support dbt-clickhouse supports most of the cross database macros now included in dbt-core, with the following exceptions: -* The `listagg` SQL function (and therefore the corresponding dbt macro) is not supported by ClickHouse. You can achieve similar results with the ClickHouse `groupArray` function but in some cases subqueries may be required to achieve the desired ordering. 
-* The `split_part` SQL function is implemented in ClickHouse using the splitByChar function. This function requires using a constant string for the "split" delimiter, so the `delimeter` parameter used for this macro will be interpreted as a string, not a column name -* Similarly, the `replace` SQL function in ClickHouse requires constant strings for the `old_chars` and `new_chars` parameters, so those parameters will be interpreted as strings rather than column names when invoking this macro. + +* The `split_part` SQL function is implemented in ClickHouse using the splitByChar function. This function requires + using a constant string for the "split" delimiter, so the `delimeter` parameter used for this macro will be + interpreted as a string, not a column name +* Similarly, the `replace` SQL function in ClickHouse requires constant strings for the `old_chars` and `new_chars` + parameters, so those parameters will be interpreted as strings rather than column names when invoking this macro. ## Setting `quote_columns`