diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fad32a7e224..5e5371c4589 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,27 +1,20 @@ # Lower entries on this list take precedence # Anything unmatched by selectors below -* @runleonarun - -# Cloud docs and images are assigned to Leona -/website/docs/docs/dbt-cloud/dbt-cloud-changelog.md @runleonarun -/website/docs/docs/dbt-cloud/ @runleonarun -/website/src/pages/dbt-cloud/ @runleonarun -/website/static/img/docs/dbt-cloud/ @runleonarun - -# Blog content assigned to Developer experience team -/website/blog/ @runleonarun +* @dbt-labs/product-docs +# The blog +/website/blog @KiraFuruichi # Adapter & Package Development Docs -/website/docs/docs/supported-data-platforms.md/ @runleonarun @dataders -/website/docs/reference/warehouse-profiles/ @runleonarun @dataders -/website/docs/reference/resource-configs/ @runleonarun @dataders -/website/docs/guides/building-packages @runleonarun @amychen1776 @dataders @dbeatty10 -/website/docs/contributing/building-a-new-adapter @runleonarun @dataders @dbeatty10 -/website/docs/contributing/testing-a-new-adapter @runleonarun @dataders @dbeatty10 -/website/docs/guides/creating-new-materializations @runleonarun @dataders @dbeatty10 -/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/ @runleonarun @dataders @dbeatty10 +/website/docs/docs/supported-data-platforms.md @dbt-labs/product-docs @dataders +/website/docs/reference/warehouse-setups @dbt-labs/product-docs @dataders +# `resource-configs` contains more than just warehouse setups +/website/docs/reference/resource-configs/*-configs.md @dbt-labs/product-docs @dataders +/website/docs/guides/advanced/adapter-development @dbt-labs/product-docs @dataders @dbeatty10 + +/website/docs/guides/building-packages @dbt-labs/product-docs @amychen1776 @dataders @dbeatty10 +/website/docs/guides/creating-new-materializations @dbt-labs/product-docs @dataders @dbeatty10 diff --git a/.github/ISSUE_TEMPLATE/improve-docs.yml 
b/.github/ISSUE_TEMPLATE/a-improve-docs.yml similarity index 94% rename from .github/ISSUE_TEMPLATE/improve-docs.yml rename to .github/ISSUE_TEMPLATE/a-improve-docs.yml index 57dc64cc312..c9030bc227b 100644 --- a/.github/ISSUE_TEMPLATE/improve-docs.yml +++ b/.github/ISSUE_TEMPLATE/a-improve-docs.yml @@ -5,7 +5,7 @@ body: - type: markdown attributes: value: | - * You can ask questions or submit ideas for the dbt docs in [Discussions](https://github.com/dbt-labs/docs.getdbt.com/discussions) + * You can ask questions or submit ideas for the dbt docs in [Issues](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). @@ -39,4 +39,4 @@ body: label: Additional information description: Add any other context or screenshots about the feature request here. validations: - required: false \ No newline at end of file + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9349000f66b..f3a3521bdec 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,5 @@ blank_issues_enabled: true contact_links: - - name: Want to see new content? Open a discussion! - url: https://github.com/dbt-labs/docs.getdbt.com/discussions/new - about: You can open a discussion to propose new content for the dbt product documentation. - name: Have questions about dbt? Join the Community! url: https://www.getdbt.com/community/join-the-community about: You can join the dbt Labs Community to ask and answer questions. 
diff --git a/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml b/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml index f138b9e4e06..037da98dc6f 100644 --- a/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml +++ b/.github/ISSUE_TEMPLATE/contribute-to-developer-blog.yml @@ -1,4 +1,4 @@ -name: Contribute to the dbt Developer Blog +name: Propose a dbt Developer Blog idea description: > For proposing a new post on the dbt Developer Blog. labels: ["content","developer blog"] diff --git a/.github/ISSUE_TEMPLATE/improve-the-site.yml b/.github/ISSUE_TEMPLATE/improve-the-site.yml index e0556d7374f..01ebdea711a 100644 --- a/.github/ISSUE_TEMPLATE/improve-the-site.yml +++ b/.github/ISSUE_TEMPLATE/improve-the-site.yml @@ -1,11 +1,11 @@ -name: Improve the docs.getdbt.com site -description: Make a suggestion or report a problem about the technical implementation of docs.getdbt.com. -labels: ["engineering"] +name: Report a docs.getdbt.com site issue +description: Report a problem about the technical implementation of docs.getdbt.com. +labels: ["engineering","bug"] body: - type: markdown attributes: value: | - * You can ask questions or submit ideas for the dbt docs in [Discussions](https://github.com/dbt-labs/docs.getdbt.com/discussions) + * You can ask questions or submit ideas for the dbt docs in [Issues](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). @@ -39,4 +39,4 @@ body: label: Additional information description: Any additional information, configuration, or data that might be necessary to reproduce the issue. 
validations: - required: false \ No newline at end of file + required: false diff --git a/.github/ISSUE_TEMPLATE/new-dbt-feature.yml b/.github/ISSUE_TEMPLATE/new-dbt-feature.yml deleted file mode 100644 index fa46a189fc4..00000000000 --- a/.github/ISSUE_TEMPLATE/new-dbt-feature.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Start docs project for a new feature -description: For dbt PMs to add docs for their new or updated dbt product features. -labels: ["content","upcoming release"] -body: - - type: markdown - attributes: - value: | - * Before you file an issue read the [Contributing guide](https://github.com/dbt-labs/docs.getdbt.com#contributing). - * Check to make sure someone hasn't already opened a similar [issue](https://github.com/dbt-labs/docs.getdbt.com/issues). - - - type: checkboxes - id: contributions - attributes: - label: Contributions - description: This applies to new, unreleased content. - options: - - label: I am a PM or subject matter expert at dbt who is responsible for this feature. - - - type: textarea - attributes: - label: Where does this content belong? - description: | - - Give as much detail as you can to help us understand where you expect the content to live. - validations: - required: true - - - type: textarea - attributes: - label: Link to source material - description: | - Use the [source material template](https://docs.google.com/document/d/1lLWGMXJFjkY4p7r8ZKhBX73dOLmIjgXZBYq39LqmAJs/edit) to provide source material for this feature. - validations: - required: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml b/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml new file mode 100644 index 00000000000..e19accf6ebb --- /dev/null +++ b/.github/ISSUE_TEMPLATE/zzz_add-adapter-to-trusted-list.yml @@ -0,0 +1,62 @@ +name: Add adapter to Trusted list +description: For adapter maintainers who wish to have theirs added to the list of Trusted adapters. 
+title: "Trust dbt-myadapter" +labels: ["adapter maintainers"] +assignees: + - dataders +body: + - type: markdown + attributes: + value: | + We're excited that you'd like to support your adapter formally as "Trusted"! This template will ensure that you are aware of the process and the guidelines. It also asks you to vouch that your adapter currently meets the standards of a Trusted adapter. For more information, see [Trusted adapters](https://docs.getdbt.com/docs/trusted-adapters). + + - type: input + id: adapter-repo + attributes: + label: Link to adapter repo + description: Please link to the GitHub repo + validations: + required: true + + - type: input + id: contact + attributes: + label: Contact Details + description: How can we get in touch with you? + placeholder: your preferred email and/or dbt Slack handle + validations: + required: true + + - type: dropdown + id: author_type + attributes: + label: Which of these best describes you? + options: + - I am a dbt Community member + - I work for the vendor on top of which the dbt adapter functions + validations: + required: true + + - type: checkboxes + id: read-program-guide + attributes: + label: Please agree to each of the following + options: + - label: I am a maintainer of the adapter being submitted for Trusted status + required: true + - label: I have read both the [Trusted adapters](https://docs.getdbt.com/docs/trusted-adapters) and [Building a Trusted Adapter](https://docs.getdbt.com/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter) pages. 
+ required: true + - label: I believe that the adapter currently meets the expectations given above + required: true + - label: I will ensure this adapter stays in compliance with the guidelines + required: true + - label: I understand that dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the below guidelines not be met + required: true + + - type: textarea + id: icon + attributes: + label: What icon should be used? + description: | + Please share an SVG image that you'd like to be displayed for your adapter. Normally, this is the logo for the data platform on top of which your adapter works. If there's a dark mode version, please also share that. + Pasting the image from your clipboard will upload the file to GitHub and create markdown formatting for it to be rendered inline diff --git a/.github/config.yml b/.github/config.yml new file mode 100644 index 00000000000..f32a8ec1de5 --- /dev/null +++ b/.github/config.yml @@ -0,0 +1,14 @@ +# Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome + +# Comment to be posted on PRs from first-time contributors in your repository +newPRWelcomeComment: > + Hello!👋 Thanks for contributing to the dbt product documentation and opening this pull request! ✨ + + We use Markdown and some HTML to write the dbt product documentation. When writing content, you can use our [style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) and [content types](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-types.md) to understand our writing standards and how we organize information in the dbt product docs. + + We'll review your contribution and respond as soon as we can. 
😄 + + + + + diff --git a/.github/labeler.yml b/.github/labeler.yml index 595db45cd66..176f1874009 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -2,7 +2,9 @@ developer blog: - website/blog/**/* guides: -- website/docs/docs/guides/**/* +- website/docs/guides/**/* content: - website/docs/**/* +- website/snippets/**/* +- website/static/**/* diff --git a/.github/lychee.toml b/.github/lychee.toml new file mode 100644 index 00000000000..2d9f0185496 --- /dev/null +++ b/.github/lychee.toml @@ -0,0 +1,54 @@ +### +### Display +### + +# Verbose program output +# Accepts log level: "error", "warn", "info", "debug", "trace" +verbose = "info" + +# Don't show interactive progress bar while checking links. +no_progress = true + +### +### Cache +### + +# Enable link caching. This can be helpful to avoid checking the same links on +# multiple runs. +cache = false + +### +### Requests +### + +# Website timeout from connect to response finished. +timeout = 30 + +# Comma-separated list of accepted status codes for valid links. +accept = [200, 403, 429] + +# Base URL or website root directory to check relative URLs. +base = "https://docs.getdbt.com" + +### +### Exclusions +### + +# Exclude URLs from checking (supports regex) +exclude = [ + 'frontMatter.', + 'https://badge.fury.io', + 'https://img.shields.io', + 'https://gitlab.com', + 'https://dbtlabs.postman.co', + 'https://mobile.twitter.com', + 'https://twitter.com', + 'https://www.twitter.com', +] + +# Exclude all mail addresses from checking +exclude_mail = true + +# Check links inside `<code>` and `<pre>` blocks as well as Markdown code
+# blocks.
+include_verbatim = false
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 5ae7e9f4d6d..90f4938d2cb 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,30 +1,25 @@
-## Description & motivation
+## What are you changing in this pull request and why?
 
 
-## To-do before merge
-
+- [ ] Review the [Content style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) and [About versioning](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/single-sourcing-content.md#adding-a-new-version) so my content adheres to these guidelines.
+- [ ] Add a checklist item for anything that needs to happen before this PR is merged, such as "needs technical review" or "change base branch."
 
-## Prerelease docs
-If this change is related to functionality in a prerelease version of dbt (delete if not applicable):
-- [ ] I've added versioning components, as described in ["Versioning Docs"](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/versioningdocs.md)
-- [ ] I've added a note to the prerelease version's [Migration Guide](https://github.com/dbt-labs/docs.getdbt.com/tree/current/website/docs/guides/migration/versions)
-
-## Checklist
-If you added new pages (delete if not applicable):
-- [ ] The page has been added to `website/sidebars.js`
-- [ ] The new page has a unique filename
+Adding new pages (delete if not applicable):
+- [ ] Add page to `website/sidebars.js`
+- [ ] Provide a unique filename for the new page
 
-If you removed existing pages (delete if not applicable):
-- [ ] The page has been removed from `website/sidebars.js`
-- [ ] An entry has been added to `_redirects`
-- [ ] [Run link testing](https://github.com/dbt-labs/docs.getdbt.com#running-the-cypress-tests-locally) to update the links that point to the deleted page
+Removing or renaming existing pages (delete if not applicable):
+- [ ] Remove page from `website/sidebars.js`
+- [ ] Add an entry to `website/static/_redirects`
+- [ ] [Run link testing](https://github.com/dbt-labs/docs.getdbt.com#running-the-cypress-tests-locally) to update the links that point to the removed or renamed page
diff --git a/.github/teams.yml b/.github/teams.yml
new file mode 100644
index 00000000000..1dd01020578
--- /dev/null
+++ b/.github/teams.yml
@@ -0,0 +1,5 @@
+Docs team:
+  - '@runleonarun'
+  - '@mirnawong1'
+  - '@nghi-ly'
+  - '@matthewshaver'
diff --git a/.github/workflows/add_issue_to_project.yml b/.github/workflows/add_issue_to_project.yml
index d45a171f337..38a9eedc816 100644
--- a/.github/workflows/add_issue_to_project.yml
+++ b/.github/workflows/add_issue_to_project.yml
@@ -1,123 +1,15 @@
 name: Add Issue to Project
+
 on:
   issues:
     types: [opened, reopened]
 
 jobs:
-  track_issue:
+  add-to-project:
+    name: Add issue to project
     runs-on: ubuntu-latest
-
     steps:
-      - uses: octokit/graphql-action@v2.x
-        name: Get project data
-        id: get_project_info          
-        with:
-          query: |
-            query($org: String!, $number: Int!) {
-              organization(login: $org){
-                projectNext(number: $number) {
-                  id
-                  fields(first:20) {
-                    nodes {
-                      id
-                      name
-                      settings
-                    }
-                  }
-                }
-              }
-            }
-          org: 'dbt-labs'
-          number: 14
-          headers: 'GraphQL-Features: projects_next_graphql'
-        env:
-          GITHUB_TOKEN: ${{ secrets.PROJECT_NEXT }}
-
-      - uses: octokit/graphql-action@v2.x
-        name: Add Issue to Project
-        id: add_issue
-        with:
-          query: |
-            mutation($project:ID!, $issue:ID!) {
-                addProjectNextItem(input: {projectId: $project, contentId: $issue}) {
-                  projectNextItem {
-                    id
-                  }
-                }
-              }
-          project: ${{ fromJSON(steps.get_project_info.outputs.data).organization.projectNext.id }}
-          issue: ${{ github.event.issue.node_id}}
-          headers: 'GraphQL-Features: projects_next_graphql'
-        env:
-          GITHUB_TOKEN: ${{ secrets.PROJECT_NEXT }}
-
-      - name: create-json
-        id: create-json
-        uses: jsdaniell/create-json@1.1.2
-        with:
-          name: "data.json"
-          json: ${{ steps.get_project_info.outputs.data }}
-    
-      - name: Extract date field ID
-        uses: sergeysova/jq-action@v2
-        id: date_field_id
-        with:
-          cmd: 
-            jq '.organization.projectNext.fields.nodes[] | select(.name == "Date") | .id' data.json
-
-      - name: Extract status field ID
-        uses: sergeysova/jq-action@v2
-        id: status_field_id
-        with:
-          cmd: 
-            jq '.organization.projectNext.fields.nodes[] | select(.name == "Status") | .id' data.json
-
-      - name: Extract status field value
-        uses: sergeysova/jq-action@v2
-        id: status_field_value
-        with:
-          cmd: 
-            jq '.organization.projectNext.fields.nodes[] | select(.name== "Status") |.settings | fromjson.options[] | select(.name=="Triage") |.id' data.json
-      
-      - name: Set fields
-        id: set_fields
-        uses: octokit/graphql-action@v2.x  
+      - uses: actions/add-to-project@v0.5.0
         with:
-          query: |
-            mutation (
-              $project: ID!
-              $item: ID!
-              $status_field: ID!
-              $status_value: String!
-              $date_field: ID!
-              $date_value: String!
-            ) {
-              set_status: updateProjectNextItemField(input: {
-                projectId: $project
-                itemId: $item
-                fieldId: $status_field
-                value: $status_value
-              }) {
-                projectNextItem {
-                  id
-                  }
-              }
-              set_date_posted: updateProjectNextItemField(input: {
-                projectId: $project
-                itemId: $item
-                fieldId: $date_field
-                value: $date_value
-              }) {
-                projectNextItem {
-                  id
-                }
-              }
-            }
-          GITHUB_TOKEN: ${{ secrets.PROJECT_NEXT }}
-          project: ${{ fromJSON(steps.get_project_info.outputs.data).organization.projectNext.id }}
-          headers: 'GraphQL-Features: projects_next_graphql'
-          item: ${{ fromJSON(steps.add_issue.outputs.data).addProjectNextItem.projectNextItem.id }}
-          status_field: ${{ steps.status_field_id.outputs.value }}
-          status_value: ${{ steps.status_field_value.outputs.value }}
-          date_field: ${{ steps.date_field_id.outputs.value }}
-          date_value: ${{ github.event.issue.created_at }}
+          project-url: https://github.com/orgs/dbt-labs/projects/14
+          github-token: ${{ secrets.DOCS_SECRET }}
diff --git a/.github/workflows/add_pr_to_project.yml b/.github/workflows/add_pr_to_project.yml
index a08840f98d0..d238271e590 100644
--- a/.github/workflows/add_pr_to_project.yml
+++ b/.github/workflows/add_pr_to_project.yml
@@ -1,123 +1,15 @@
-name: Add PR to project
+name: Add to projects
+
 on:
   pull_request_target:
-    types: [opened,reopened]
+    types: [opened, reopened]
 
 jobs:
-  track_pr:
+  add-to-project:
+    name: Add pr to project
     runs-on: ubuntu-latest
-
     steps:
-      - uses: octokit/graphql-action@v2.x
-        name: Get project data
-        id: get_project_info          
-        with:
-          query: |
-            query($org: String!, $number: Int!) {
-              organization(login: $org){
-                projectNext(number: $number) {
-                  id
-                  fields(first:20) {
-                    nodes {
-                      id
-                      name
-                      settings
-                    }
-                  }
-                }
-              }
-            }
-          org: 'dbt-labs'
-          number: 14
-          headers: 'GraphQL-Features: projects_next_graphql'
-        env:
-          GITHUB_TOKEN: ${{ secrets.PROJECT_NEXT }}
-
-      - uses: octokit/graphql-action@v2.x
-        name: Add PR to Project
-        id: add_pr
-        with:
-          query: |
-            mutation($project:ID!, $pr:ID!) {
-                addProjectNextItem(input: {projectId: $project, contentId: $pr}) {
-                  projectNextItem {
-                    id
-                  }
-                }
-              }
-          project: ${{ fromJSON(steps.get_project_info.outputs.data).organization.projectNext.id }}
-          pr: ${{ github.event.pull_request.node_id}}
-          headers: 'GraphQL-Features: projects_next_graphql'
-        env:
-          GITHUB_TOKEN: ${{ secrets.PROJECT_NEXT }}
-
-      - name: create-json
-        id: create-json
-        uses: jsdaniell/create-json@1.1.2
-        with:
-          name: "data.json"
-          json: ${{ steps.get_project_info.outputs.data }}
-
-      - name: Extract date field ID
-        uses: sergeysova/jq-action@v2
-        id: date_field_id
-        with:
-          cmd: 
-            jq '.organization.projectNext.fields.nodes[] | select(.name == "Date") | .id' data.json
-
-      - name: Extract status field ID
-        uses: sergeysova/jq-action@v2
-        id: status_field_id
-        with:
-          cmd: 
-            jq '.organization.projectNext.fields.nodes[] | select(.name == "Status") | .id' data.json
-
-      - name: Extract status field value
-        uses: sergeysova/jq-action@v2
-        id: status_field_value
-        with:
-          cmd: 
-            jq '.organization.projectNext.fields.nodes[] | select(.name== "Status") |.settings | fromjson.options[] | select(.name=="Triage") |.id' data.json
-
-      - name: Set fields
-        id: set_fields
-        uses: octokit/graphql-action@v2.x  
+      - uses: actions/add-to-project@v0.5.0
         with:
-          query: |
-            mutation (
-              $project: ID!
-              $item: ID!
-              $status_field: ID!
-              $status_value: String!
-              $date_field: ID!
-              $date_value: String!
-            ) {
-              set_status: updateProjectNextItemField(input: {
-                projectId: $project
-                itemId: $item
-                fieldId: $status_field
-                value: $status_value
-              }) {
-                projectNextItem {
-                  id
-                  }
-              }
-              set_date_posted: updateProjectNextItemField(input: {
-                projectId: $project
-                itemId: $item
-                fieldId: $date_field
-                value: $date_value
-              }) {
-                projectNextItem {
-                  id
-                }
-              }
-            }
-          GITHUB_TOKEN: ${{ secrets.PROJECT_NEXT }}
-          project: ${{ fromJSON(steps.get_project_info.outputs.data).organization.projectNext.id }}
-          headers: 'GraphQL-Features: projects_next_graphql'
-          item: ${{ fromJSON(steps.add_pr.outputs.data).addProjectNextItem.projectNextItem.id }}
-          status_field: ${{ steps.status_field_id.outputs.value }}
-          status_value: ${{ steps.status_field_value.outputs.value }}
-          date_field: ${{ steps.date_field_id.outputs.value }}
-          date_value: ${{ github.event.pull_request.created_at }}
+          project-url: https://github.com/orgs/dbt-labs/projects/37
+          github-token: ${{ secrets.DOCS_SECRET }}
diff --git a/.github/workflows/autogenerated_labeler.yml b/.github/workflows/autogenerated_labeler.yml
new file mode 100644
index 00000000000..e6aab0492b8
--- /dev/null
+++ b/.github/workflows/autogenerated_labeler.yml
@@ -0,0 +1,40 @@
+# **what?**
+# Labels issues autogenerated in dbt-core
+
+# **why?**
+# To organize autogenerated issues from dbt-core to make it easier to find and track them.
+
+# **when?**
+# When an issue is opened by the FishtownBuildBot
+
+name: Add Labels to Autogenerated Issues
+
+on:
+  issues:
+    types: [opened]
+    
+jobs:
+  add_customized_labels:
+    if: github.event.issue.user.login == 'FishtownBuildBot'
+    permissions:
+      issues: write
+
+    runs-on: ubuntu-latest
+    steps:
+      - name: "Determine appropriate labels by repo in title"
+        id: repo
+        env:
+          ISSUE_TITLE: ${{ github.event.issue.title }}
+        run: |
+          if [[ "$ISSUE_TITLE" == *"dbt-core"* ]]; then  # title is read from the env var set above
+            echo "labels='content,improvement,dbt Core'" >> "$GITHUB_OUTPUT"  # quoted so the path cannot word-split
+          else
+            echo "labels='content,improvement,adapters'" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: "Add Labels to autogenerated Issues"
+        id: add-labels
+        run: |
+          gh issue edit ${{ github.event.issue.number }} --repo ${{ github.repository }} --add-label ${{ steps.repo.outputs.labels }}
+        env:
+          GH_TOKEN: ${{ secrets.DOCS_SECRET }}
diff --git a/.github/workflows/autoupdate.yml b/.github/workflows/autoupdate.yml
new file mode 100644
index 00000000000..f26abcb6802
--- /dev/null
+++ b/.github/workflows/autoupdate.yml
@@ -0,0 +1,36 @@
+name: Auto Update
+
+on:
+  # This will trigger on all pushes to all branches.
+#  push: {}
+  # Alternatively, you can only trigger if commits are pushed to certain branches, e.g.:
+  push:
+    branches:
+      - current
+  #     - unstable
+jobs:
+  autoupdate:
+    name: autoupdate
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: chinthakagodawita/autoupdate@v1.7.0
+        # The id lets the steps below read this step's outputs via `steps.autoupdate`.
+        id: autoupdate
+        env:
+          GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
+          DRY_RUN: "false"
+          PR_FILTER: "labelled"
+          PR_LABELS: "auto update"
+          PR_READY_STATE: "all"
+         # EXCLUDED_LABELS: "do not merge"
+          MERGE_MSG: "This branch was auto-updated!"
+          RETRY_COUNT: "5"
+          RETRY_SLEEP: "300"
+          MERGE_CONFLICT_ACTION: "ignore"
+
+      # Step outputs are strings, so compare with 'true' instead of relying on truthiness.
+      - run: echo 'We found merge conflicts when updating this PR. Please fix them as soon as you can.'
+        if: ${{ steps.autoupdate.outputs.conflicted == 'true' }}
+
+      - run: echo 'Good news! No merge conflicts this time around.'
+        if: ${{ steps.autoupdate.outputs.conflicted != 'true' }}
diff --git a/.github/workflows/create_next_pr.yml b/.github/workflows/create_next_pr.yml
index becef0db654..185edbb1c8a 100644
--- a/.github/workflows/create_next_pr.yml
+++ b/.github/workflows/create_next_pr.yml
@@ -5,9 +5,10 @@ on:
 
 jobs:
   pull-request:
+    if: github.repository == 'dbt-labs/docs.getdbt.com'
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v3
     - name: pull-request
       uses: repo-sync/pull-request@v2
       with:
diff --git a/.github/workflows/cypress_tests.yml b/.github/workflows/cypress_tests.yml
index 1973e0ef43e..5e6cbbfedbb 100644
--- a/.github/workflows/cypress_tests.yml
+++ b/.github/workflows/cypress_tests.yml
@@ -12,10 +12,10 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
 
       - name: install node
-        uses: actions/setup-node@v2
+        uses: actions/setup-node@v3
         with:
           node-version: '16.13.1'
 
@@ -28,7 +28,7 @@ jobs:
           echo "PR_NUMBER=${PR_NUMBER}" >> ${GITHUB_ENV}
 
       - name: run cypress e2e 
-        uses: cypress-io/github-action@v4
+        uses: cypress-io/github-action@v5
         with:
           command: npx cypress run
           browser: chrome
@@ -40,8 +40,8 @@ jobs:
           CYPRESS_BASE_URL: https://deploy-preview-${{ env.PR_NUMBER }}--docs-getdbt-com.netlify.app
 
       - name: Screenshot artifact upload
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         if: failure()
         with:
           name: cypress-screenshots
-          path: cypress/screenshots
\ No newline at end of file
+          path: cypress/screenshots
diff --git a/.github/workflows/label.yml b/.github/workflows/label.yml
new file mode 100644
index 00000000000..48615e60b9e
--- /dev/null
+++ b/.github/workflows/label.yml
@@ -0,0 +1,49 @@
+name: Add/Remove Labels
+
+on:
+  pull_request_target:
+    types: [opened]
+
+jobs:
+  add_new_contributor_label:
+    if: github.event.action == 'opened'
+    runs-on: ubuntu-latest
+    steps:
+    - name: Add new contributor label
+      uses: actions/github-script@v6
+      with:
+        github-token: ${{ secrets.DOCS_SECRET }}
+        script: |
+            // Labels the PR "new contributor" when the event sender has no other
+            // pull request (in any state) in this repository.
+            const creator = context.payload.sender.login;
+            const currentNumber = context.issue.number;
+
+            // List everything this login created, in every state; entries that
+            // are pull requests carry a `pull_request` property (checked below).
+            const createdItems = await github.paginate(
+              github.rest.issues.listForRepo.endpoint.merge({
+                ...context.issue,
+                creator,
+                state: 'all',
+              })
+            );
+
+            // A prior contribution is any item other than this PR that is a
+            // pull request by the same login.
+            const isAlreadyContributor = createdItems.some(
+              (item) =>
+                item.number !== currentNumber &&
+                Boolean(item.pull_request) &&
+                item.user.login === creator
+            );
+
+            if (!isAlreadyContributor) {
+              console.log('Adding label: new contributor');
+              await github.rest.issues.addLabels({
+                issue_number: currentNumber,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                labels: ['new contributor'],
+              });
+            }
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 057208eda32..cc231cdcde3 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -1,7 +1,12 @@
+#
+# To use this workflow, you will need to set up a .github/labeler.yml
+# file with configuration.  For more information, see:
+# https://github.com/actions/labeler
+
 name: "Pull Request Labeler"
 on:
-- pull_request_target
-
+  pull_request_target:
+    types: [opened]
 jobs:
   triage:
     permissions:
diff --git a/.github/workflows/labelsizer.yml b/.github/workflows/labelsizer.yml
index d1ad5776597..2c0ee05e886 100644
--- a/.github/workflows/labelsizer.yml
+++ b/.github/workflows/labelsizer.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     name: Label the PR size
     steps:
-      - uses: codelytv/pr-size-labeler@v1.8.1
+      - uses: codelytv/pr-size-labeler@v1
         with:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           xs_label: 'size: x-small'
diff --git a/.github/workflows/links.yml b/.github/workflows/links.yml
new file mode 100644
index 00000000000..7e0dd52b60c
--- /dev/null
+++ b/.github/workflows/links.yml
@@ -0,0 +1,17 @@
+name: Docs Link Checker
+
+on:
+  schedule:
+    # Run every day at 12:00 UTC
+    - cron: '0 12 * * *'
+
+jobs:
+  markdown-link-check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Link Checker
+        uses: lycheeverse/lychee-action@master
+        with:
+          args: --verbose --config .github/lychee.toml './website/**/*.md'
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 00000000000..f49c3f1317b
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,20 @@
+name: Run ESLint Checks
+on: push
+
+jobs:
+  eslint-check:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+    
+    - name: Install Node
+      uses: actions/setup-node@v3
+      with:
+        node-version: '18.12.0'
+
+    - name: Install Packages
+      run: cd website && npm ci
+      
+    - name: Run ESLint
+      run: cd website && npm run lintAll
diff --git a/.github/workflows/team-labeler.yml b/.github/workflows/team-labeler.yml
new file mode 100644
index 00000000000..81af84881bd
--- /dev/null
+++ b/.github/workflows/team-labeler.yml
@@ -0,0 +1,9 @@
+on: [pull_request_target]
+name: team-label
+jobs:
+  team-labeler:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: JulienKode/team-labeler-action@v1.1.0
+      with:
+        repo-token: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/.gitignore b/.gitignore
index 172fef2f8e3..74d338484aa 100755
--- a/.gitignore
+++ b/.gitignore
@@ -11,9 +11,14 @@ website/yarn.lock
 website/node_modules
 website/i18n/*
 
-# Local vs code 
+# IDE configs
 .vscode
+.idea
+
 # Local Netlify folder
 .netlify
 
-.vscode
+.eslintcache
+
+# Local Vercel folder
+.vercel
diff --git a/.husky/pre-commit b/.husky/pre-commit
new file mode 100755
index 00000000000..54372ea3e9a
--- /dev/null
+++ b/.husky/pre-commit
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+. "$(dirname -- "$0")/_/husky.sh"
+
+cd website && npx lint-staged
diff --git a/.vscode/settings.json b/.vscode/settings.json
deleted file mode 100644
index 1e7c2ea79e0..00000000000
--- a/.vscode/settings.json
+++ /dev/null
@@ -1,9 +0,0 @@
-{
-    "spellright.language": [
-        "en"
-    ],
-    "spellright.documentTypes": [
-        "latex",
-        "plaintext"
-    ]
-}
\ No newline at end of file
diff --git a/README.md b/README.md
index 83aa4f7e5bc..c749fedf95a 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,7 @@ Creating an inclusive and equitable environment for our documents is more import
 We welcome contributions from community members to this repo:
 - **Fixes**: When you notice an error, you can use the `Edit this page` button at the bottom of each page to suggest a change.
 - **New documentation**: If you contributed code in [dbt-core](https://github.com/dbt-labs/dbt-core), we encourage you to also write the docs here! Please reach out in the dbt community if you need help finding a place for these docs.
-- **Major rewrites**: You can [file an issue](https://github.com/dbt-labs/docs.getdbt.com/issues/new?assignees=&labels=content%2Cimprovement&template=improve-docs.yml) or [start a discussion](https://github.com/dbt-labs/docs.getdbt.com/discussions) to propose ideas for a content area that requires attention.
+- **Major rewrites**: You can [file an issue](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to propose ideas for a content area that requires attention.
 
 You can use components documented in the [docusaurus library](https://v2.docusaurus.io/docs/markdown-features/).
 
@@ -42,7 +42,7 @@ You can add code snippets and other content in a tabbed view. To learn more abou
 
 # Running the Docs site locally
 
-You can click a link available in a netlify bot PR comment to see and review your changes rendered on a staging server. You are also able to see and review your proposed modifications locally on your computer. Our setup instructions use [homebrew](https://brew.sh/):
+You can click the link in the Vercel bot's PR comment to see and review your changes rendered on a staging server. You can also see and review your proposed modifications locally on your computer. Our setup instructions use [Homebrew](https://brew.sh/):
 
 ## Prerequisites
 
@@ -56,9 +56,9 @@ You can click a link available in a netlify bot PR comment to see and review you
 2. Clone this repo: `git clone https://github.com/dbt-labs/docs.getdbt.com.git`
 3. `cd` into the repo: `cd docs.getdbt.com`
 4. `cd` into the `website` subdirectory: `cd website`
-5. Install the required node packages: `npm install` (optional — install any updates)
-6. Build the website: `npm start`
-7. Before pushing your changes to a branch, check that all links work by using the `make build` script.
+5. Install the required node packages: `make install` or `npm install` (optional — install any updates)
+6. Build the website: `make run` or `npm start`
+7. Before pushing your changes to a branch, run `make build` or `npm run build` and check that all links work.
 
 Advisory:
 - If you run into an `fatal error: 'vips/vips8' file not found` error when you run `npm install`, you may need to run `brew install vips`. Warning: this one will take a while -- go ahead and grab some coffee!
diff --git a/_headers b/_headers
deleted file mode 100644
index f6b636c5158..00000000000
--- a/_headers
+++ /dev/null
@@ -1,5 +0,0 @@
-/*
-  Strict-Transport-Security: max-age=63072000; includeSubDomains; preload
-  Content-Security-Policy: object-src 'none'; frame-ancestors 'none';
-  X-Content-Type-Options: nosniff
-  X-XSS-Protection: 1; mode=block
diff --git a/_redirects b/_redirects
deleted file mode 100644
index aab9b67f89c..00000000000
--- a/_redirects
+++ /dev/null
@@ -1,527 +0,0 @@
-/docs/running-a-dbt-project/running-dbt-in-production /docs/deploy/deployments 301
-/docs/running-a-dbt-project/profile /docs/get-started/connection-profiles 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-slack-notifications /docs/deploy/job-notifications 301
-/docs/dbt-cloud/using-dbt-cloud /docs/develop/develop-in-the-cloud 301
-/docs/dbt-cloud/january-2020-pricing-updates https://www.getdbt.com/pricing/  301
-/docs/dbt-cloud/dbt-cloud-enterprise https://www.getdbt.com/pricing/  301
-/docs/dbt-cloud/cloud-quickstart /docs/get-started/getting-started/set-up-dbt-cloud 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud /docs/develop/getting-started/getting-set-up 301
-/docs/building-a-dbt-project/archival /docs/build/snapshots 301
-/docs/about/license /community/resources/contributor-license-agreements 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-using-a-managed-repository /docs/collaborate/git/managed-repository 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database /docs/get-started/connect-your-database 301
-/docs/dbt-cloud/release-notes /docs/dbt-versions/dbt-cloud-release-notes 301
-/docs/dbt-cloud/dbt-cloud-enterprise/audit-log /docs/collaborate/manage-access/audit-log 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-setting-up-bigquery-oauth /docs/collaborate/manage-access/set-up-bigquery-oauth 301
-/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-snowflake-oauth /docs/collaborate/manage-access/set-up-snowflake-oauth 301
-/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-okta /docs/collaborate/manage-access/set-up-sso-okta 301
-/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-sso-with-azure-active-directory /docs/collaborate/manage-access/set-up-sso-azure-active-directory 301
-/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-google-gsuite /docs/collaborate/manage-access/set-up-sso-google-workspace 301
-/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-saml-2.0 /docs/collaborate/manage-access/set-up-sso-saml-2.0 301
-/docs/dbt-cloud/dbt-cloud-enterprise/sso-overview /docs/collaborate/manage-access/sso-overview 301
-/docs/dbt-cloud/access-control/enterprise-permissions /docs/collaborate/manage-access/enterprise-permissions 301
-/docs/dbt-cloud/access-control/self-service-permissions /docs/collaborate/manage-access/self-service-permissions 301
-/docs/dbt-cloud/access-control/cloud-seats-and-users /docs/collaborate/manage-access/seats-and-users 301
-/docs/dbt-cloud/access-control/access-control-overview /docs/collaborate/manage-access/about-access 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-generating-documentation /docs/collaborate/build-and-view-your-docs 301
-/docs/building-a-dbt-project/documentation /docs/collaborate/documentation 301
-/docs/building-a-dbt-project/managing-environments /docs/collaborate/environments 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-import-a-project-by-git-url /docs/collaborate/git/import-a-project-by-git-url 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/authenticate-azure /docs/collaborate/git/authenticate-azure 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/setup-azure /docs/collaborate/git/setup-azure 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-azure-devops /docs/collaborate/git/connect-azure-devops 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-gitlab /docs/collaborate/git/connect-gitlab 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-installing-the-github-application /docs/collaborate/git/connect-github 301
-/docs/dbt-cloud/cloud-ide/handling-merge-conflicts /docs/collaborate/git/resolve-merge-conflicts 301
-/docs/dbt-cloud/cloud-ide/viewing-docs-in-the-ide /docs/collaborate/cloud-build-and-view-your-docs 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-configuring-repositories /docs/collaborate/git/pr-template 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration /docs/deploy/cloud-ci-job 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-dashboard-status-tiles /docs/deploy/dashboard-status-tiles 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-snapshotting-source-freshness /docs/deploy/source-freshness 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-notifications /docs/deploy/job-notifications 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-using-a-custom-cron-schedule /docs/deploy/job-triggers 301
-/docs/dbt-cloud/deployments/airgapped-deployment /docs/deploy/airgapped-deployment 301
-/docs/dbt-cloud/deployments/single-tenant-deployment /docs/deploy/single-tenant 301
-/docs/dbt-cloud/deployments/multi-tenant-deployment /docs/deploy/multi-tenant 301
-/docs/dbt-cloud/deployments/deployment-architecture /docs/deploy/architecture 301
-/docs/dbt-cloud/deployments/deployment-overview /docs/deploy/deployments 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-setting-a-custom-target-name /docs/build/custom-target-names 301
-/docs/building-a-dbt-project/building-models/using-custom-aliases /docs/build/custom-aliases 301
-/docs/building-a-dbt-project/building-models/using-custom-databases /docs/build/custom-databases 301
-/docs/building-a-dbt-project/building-models/using-custom-schemas /docs/build/custom-schemas 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-exposures /docs/dbt-cloud-apis/metadata-schema-exposures 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-exposure /docs/dbt-cloud-apis/metadata-schema-exposure 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-tests /docs/dbt-cloud-apis/metadata-schema-tests 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-test /docs/dbt-cloud-apis/metadata-schema-test 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-snapshots /docs/dbt-cloud-apis/metadata-schema-snapshots 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-seeds /docs/dbt-cloud-apis/metadata-schema-seeds 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-seed /docs/dbt-cloud-apis/metadata-schema-seed 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-sources /docs/dbt-cloud-apis/metadata-schema-sources 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-source /docs/dbt-cloud-apis/metadata-schema-source 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-metrics /docs/dbt-cloud-apis/metadata-schema-metrics 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-metric /docs/dbt-cloud-apis/metadata-schema-metric 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-modelByEnv /docs/dbt-cloud-apis/metadata-schema-modelByEnv 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-models /docs/dbt-cloud-apis/metadata-schema-models 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-model /docs/dbt-cloud-apis/metadata-schema-model 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/metadata-querying /docs/dbt-cloud-apis/metadata-querying 301
-/docs/dbt-cloud/dbt-cloud-api/metadata/metadata-overview /docs/dbt-cloud-apis/metadata-api 301
-/docs/dbt-cloud/dbt-cloud-api/admin-cloud-api /docs/dbt-cloud-apis/admin-cloud-api 301
-/docs/dbt-cloud/dbt-cloud-api/service-tokens /docs/dbt-cloud-apis/service-tokens 301
-/docs/dbt-cloud/dbt-cloud-api/user-tokens /docs/dbt-cloud-apis/user-tokens 301
-/docs/dbt-cloud/dbt-cloud-api/cloud-apis /docs/dbt-cloud-apis/overview 301
-/docs/building-a-dbt-project/hooks-operations /docs/build/hooks-operations 301
-/docs/building-a-dbt-project/analyses /docs/build/analyses 301
-/docs/building-a-dbt-project/package-management /docs/build/packages 301
-/docs/dbt-cloud/using-dbt-cloud/cloud-environment-variables /docs/build/environment-variables 301
-/docs/building-a-dbt-project/building-models/using-variables /docs/build/project-variables 301
-/docs/building-a-dbt-project/jinja-macros /docs/build/jinja-macros 301
-/docs/building-a-dbt-project/building-models/configuring-incremental-models /docs/build/incremental-models 301
-/docs/building-a-dbt-project/building-models/materializations /docs/build/materializations 301
-/docs/building-a-dbt-project/tests /docs/build/tests 301
-/docs/building-a-dbt-project/metrics /docs/build/metrics 301
-/docs/building-a-dbt-project/exposures /docs/build/exposures 301
-/docs/building-a-dbt-project/snapshots /docs/build/snapshots 301
-/docs/building-a-dbt-project/seeds /docs/build/seeds 301
-/docs/building-a-dbt-project/building-models /docs/build/sql-models 301
-/docs/building-a-dbt-project/using-sources /docs/build/sources 301
-/docs/building-a-dbt-project/projects /docs/build/projects 301
-/docs/building-a-dbt-project/building-models/python-models /docs/build/python-models 301
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions /docs/dbt-versions/upgrade-core-in-cloud 301
-/docs/core-versions /docs/dbt-versions/core 301
-/docs/dbt-cloud/cloud-dbt-cloud-support /docs/dbt-support 301
-/docs/about/viewpoint /community/resources/viewpoint 301
-/docs/viewpoint /community/resources/viewpoint 301
-/dbt-cli/configure-your-profile /docs/get-started/connection-profiles 301
-/docs/running-a-dbt-project/using-the-cli /docs/get-started/about-the-cli 301
-/dbt-cli/install/from-source /docs/get-started/source-install 301
-/dbt-cli/install/docker /docs/get-started/docker-install 301
-/dbt-cli/install/pip /docs/get-started/pip-install 301
-/dbt-cli/install/homebrew /docs/get-started/homebrew-install 301
-/dbt-cli/install/overview /docs/get-started/installation 301
-/docs/dbt-cloud/cloud-ide/the-dbt-ide /docs/get-started/dbt-cloud-features 301
-/useful*components https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/adding-page-components.md 302
-/guides/legacy/managing-environments /docs/building-a-dbt-project/managing-environments 302
-/docs/running-a-dbt-project/dbt-api /docs/introduction 301
-/img/docs/dbt-cloud/dbt-cloud-enterprise/icon.png https://www.getdbt.com/ui/img/dbt-icon.png 301!
-/dbt-cli/installation-guides/centos /docs/get-started/installation 301
-/dbt-cli/installation-guides/centos /docs/get-started/installation 301
-/dbt-cli/installation-guides/install-from-source /dbt-cli/install/from-source 302
-/dbt-cli/installation-guides/macos /docs/get-started/installation 302
-/dbt-cli/installation-guides/ubuntu-debian /docs/get-started/installation 302
-/dbt-cli/installation-guides/windows /docs/get-started/installation 302
-/dbt-cli/installation /docs/get-started/installation 302
-/dbt-jinja-functions /reference/dbt-jinja-functions 302
-/docs /docs/introduction 302
-/docs/adapter /docs/writing-code-in-dbt/jinja-context/adapter 302
-/docs/analyses /docs/building-a-dbt-project/analyses 302
-/docs/api-variable /docs/writing-code-in-dbt/api-variable 302
-/docs/archival /docs/building-a-dbt-project/archival 302
-/docs/artifacts /docs/dbt-cloud/using-dbt-cloud/artifacts 302
-/docs/bigquery-configs /reference/resource-configs/bigquery-configs 302
-/reference/resource-properties/docs /reference/resource-configs/docs 302
-/docs/building-a-dbt-project/building-models/bigquery-configs /reference/resource-configs/bigquery-configs 302
-/docs/building-a-dbt-project/building-models/configuring-models /reference/model-configs
-/docs/building-a-dbt-project/building-models/enable-and-disable-models /reference/resource-configs/enabled 302
-/docs/building-a-dbt-project/building-models/redshift-configs /reference/resource-configs/redshift-configs 302
-/docs/building-a-dbt-project/building-models/snowflake-configs /reference/resource-configs/snowflake-configs 302
-/docs/building-a-dbt-project/building-models/spark-configs /reference/resource-configs/spark-configs 302
-/docs/building-a-dbt-project/building-models/tags /reference/resource-configs/tags 302
-/docs/building-a-dbt-project/building-models/using-sql-headers /reference/resource-configs/sql_header 302
-/docs/building-a-dbt-project/dbt-projects /docs/building-a-dbt-project/projects 302
-/docs/building-a-dbt-project/dbt-projects/configuring-query-comments /reference/project-configs/query-comment 302
-/docs/building-a-dbt-project/dbt-projects/configuring-quoting /reference/project-configs/quoting 302
-/docs/building-a-dbt-project/dbt-projects/creating-a-project /docs/building-a-dbt-project/projects#creating-a-dbt-project 302
-/docs/building-a-dbt-project/dbt-projects/requiring-specific-dbt-versions /reference/project-configs/require-dbt-version 302
-/docs/building-a-dbt-project/dbt-projects/use-an-existing-project /docs/building-a-dbt-project/projects#using-an-existing-project 302
-/docs/building-a-dbt-project/hooks /docs/building-a-dbt-project/hooks-operations 302
-/docs/building-a-dbt-project/testing-and-documentation /docs/building-a-dbt-project/tests 302
-/docs/building-a-dbt-project/testing-and-documentation/documentation /docs/building-a-dbt-project/testing-and-documentation/documentation 302
-/docs/building-a-dbt-project/testing-and-documentation/documentation-website /docs/building-a-dbt-project/testing-and-documentation/documentation 302
-/docs/building-a-dbt-project/testing-and-documentation/schemayml-files /reference/declaring-properties 302
-/docs/building-a-dbt-project/testing-and-documentation/testing /docs/building-a-dbt-project/tests 302
-/docs/building-a-dbt-project/using-operations /docs/building-a-dbt-project/hooks-operations 302
-/docs/building-a-new-adapter /docs/contributing/building-a-new-adapter 302
-/docs/building-models /docs/building-a-dbt-project/building-models 302
-/docs/building-packages /guides/legacy/building-packages 302
-/docs/centos /dbt-cli/installation 302
-/docs/clean /reference/commands/clean 302
-/docs/cloud-choosing-a-dbt-version /docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version 302
-/docs/cloud-configuring-dbt-cloud /docs/dbt-cloud/cloud-configuring-dbt-cloud 302
-/docs/cloud-enabling-continuous-integration-with-github /docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github 302
-/docs/cloud-generating-documentation /docs/dbt-cloud/using-dbt-cloud/cloud-generating-documentation 302
-/docs/cloud-import-a-project-by-git-url /docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-import-a-project-by-git-url 302
-/docs/cloud-installing-the-github-application /docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-installing-the-github-application 302
-/docs/cloud-managing-permissions /docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-managing-permissions 302
-/docs/cloud-overview /docs/dbt-cloud/cloud-overview 302
-/docs/cloud-quickstart /docs/dbt-cloud/cloud-quickstart 302
-/docs/cloud-seats-and-users /docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-seats-and-users 302
-/docs/cloud-setting-a-custom-target-name /docs/dbt-cloud/using-dbt-cloud/cloud-setting-a-custom-target-name 302
-/docs/cloud-snapshotting-source-freshness /docs/dbt-cloud/using-dbt-cloud/cloud-snapshotting-source-freshness 302
-/docs/cloud-supported-dbt-versions /docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version 302
-/docs/cloud-using-a-custom-cron-schedule /docs/dbt-cloud/using-dbt-cloud/cloud-using-a-custom-cron-schedule 302
-/docs/cloud-using-a-managed-repository /docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-using-a-managed-repository 302
-/docs/cmd-docs /reference/commands/cmd-docs 302
-/docs/command-line-interface /reference/dbt-commands 302
-/docs/compile /reference/commands/compile 302
-/docs/config /docs/writing-code-in-dbt/jinja-context/config 302
-/docs/configure-your-profile /dbt-cli/configure-your-profile 302
-/docs/configuring-incremental-models /docs/building-a-dbt-project/building-models/configuring-incremental-models 302
-/docs/configuring-models /reference/model-configs 302
-/docs/configuring-query-comments /docs/building-a-dbt-project/dbt-projects/configuring-query-comments 302
-/docs/configuring-quoting /docs/building-a-dbt-project/dbt-projects/configuring-quoting 302
-/docs/configuring-resources-from-the-project-file /docs/building-a-dbt-project/dbt-projects/configuring-resources-from-the-project-file 302
-/docs/connecting-your-database /docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database 302
-/docs/contributor-license-agreements /docs/contributing/contributor-license-agreements 302
-/docs/creating-a-project /docs/building-a-dbt-project/dbt-projects/creating-a-project 302
-/docs/creating-new-materializations /guides/legacy/creating-new-materializations 302
-/docs/custom-schema-tests /guides/legacy/writing-custom-generic-tests 302
-/docs/dbt-api /docs/running-a-dbt-project/dbt-api 302
-/docs/dbt-cloud-enterprise /docs/dbt-cloud/dbt-cloud-enterprise 302
-/docs/dbt-cloud/cloud-configuring-repositories /docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-configuring-repositories 302
-/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version /docs/dbt-versions/upgrade-core-in-cloud 301
-/docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permissions/ /docs/dbt-cloud/access-control/enterprise-permissions 302
-/docs/dbt-cloud/on-premises/architecture /dbt-cloud/on-premises/dependencies 302
-/docs/dbt-projects /docs/building-a-dbt-project/dbt-projects 302
-/docs/dbt_projectyml-file /docs/building-a-dbt-project/dbt-projects/dbt_projectyml-file 302
-/docs/debug /reference/commands/debug 302
-/docs/debug-method /docs/writing-code-in-dbt/jinja-context/debug-method 302
-/docs/deps /reference/commands/deps 302
-/docs/doc /docs/writing-code-in-dbt/jinja-context/doc 302
-/docs/documentation /docs/building-a-dbt-project/documentation 302
-/docs/documentation-website /docs/building-a-dbt-project/documentation 302
-/docs/dont-nest-your-curlies /docs/building-a-dbt-project/dont-nest-your-curlies 302
-/docs/enable-and-disable-models /reference/resource-configs/enabled 302
-/docs/enterprise-permissions /docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permissions 302
-/docs/env_var /docs/writing-code-in-dbt/jinja-context/env_var 302
-/docs/exceptions /docs/writing-code-in-dbt/jinja-context/exceptions 302
-/docs/execute /docs/writing-code-in-dbt/jinja-context/execute 302
-/docs/exit-codes /reference/exit-codes 302
-/docs/flags /docs/writing-code-in-dbt/jinja-context/flags 302
-/docs/fromjson /docs/writing-code-in-dbt/jinja-context/fromjson 302
-/docs/getting-started-with-jinja /docs/building-a-dbt-project/jinja-macros 302
-/docs/global-cli-flags /reference/global-cli-flags 302
-/docs/graph /docs/writing-code-in-dbt/jinja-context/graph 302
-/docs/guides/building-packages /guides/legacy/building-packages 302
-/docs/guides/creating-new-materializations /guides/legacy/creating-new-materializations 302
-/docs/guides/debugging-errors /guides/legacy/debugging-errors 302
-/docs/guides/debugging-schema-names /guides/legacy/debugging-schema-names 302
-/docs/guides/getting-help /guides/legacy/getting-help 302
-/docs/guides/managing-environments /guides/legacy/managing-environments 302
-/docs/guides/navigating-the-docs /guides/legacy/navigating-the-docs 302
-/docs/guides/understanding-state /guides/legacy/understanding-state 302
-/docs/guides/videos /guides/legacy/videos 302
-/docs/guides/writing-custom-generic-tests /guides/legacy/writing-custom-generic-tests 302
-/docs/guides/writing-custom-schema-tests /guides/legacy/writing-custom-generic-tests 302
-/docs/guides/best-practices#choose-your-materializations-wisely /guides/legacy/best-practices#choose-your-materializations-wisely 302
-/docs/guides/best-practices#version-control-your-dbt-project /guides/legacy/best-practices#version-control-your-dbt-project 302
-/docs/best-practices /guides/legacy/best-practices 302
-/docs/guides/best-practices /guides/best-practices 302
-/docs/hooks /docs/building-a-dbt-project/hooks-operations 302
-/docs/init /reference/commands/init 302
-/docs/install-from-source /dbt-cli/installation 302
-/docs/installation /dbt-cli/installation 302
-/docs/invocation_id /docs/writing-code-in-dbt/jinja-context/invocation_id 302
-/docs/jinja-context /docs/writing-code-in-dbt/jinja-context 302
-/docs/license /docs/about/license 302
-/docs/list /reference/commands/list 302
-/docs/log /docs/writing-code-in-dbt/jinja-context/log 302
-/docs/macos /dbt-cli/installation 302
-/docs/macros /docs/building-a-dbt-project/macros 302
-/docs/managing-environments /guides/legacy/managing-environments 302
-/docs/materializations /docs/building-a-dbt-project/building-models/materializations 302
-/docs/model-selection-syntax /reference/node-selection/syntax 302
-/docs/modules /docs/writing-code-in-dbt/jinja-context/modules 302
-/docs/on-run-end-context /docs/writing-code-in-dbt/jinja-context/on-run-end-context 302
-/docs/overview /docs/introduction 302
-/docs/package-management /docs/building-a-dbt-project/package-management 302
-/docs/profile-bigquery /reference/warehouse-profiles/bigquery-profile 302
-/docs/profile-mssql /reference/warehouse-profiles/mssql-profile 302
-/docs/profile-postgres /reference/warehouse-profiles/postgres-profile 302
-/docs/profile-presto /reference/warehouse-profiles/presto-profile 302
-/docs/profile-redshift /reference/warehouse-profiles/redshift-profile 302
-/docs/profile-snowflake /reference/warehouse-profiles/snowflake-profile 302
-/docs/profile-spark /reference/warehouse-profiles/spark-profile 302
-/docs/redshift-configs /reference/resource-configs/redshift-configs 302
-/docs/spark-configs /reference/resource-configs/spark-configs 302
-/docs/redshift-v2 /reference/warehouse-profiles/redshift-profile 302
-/docs/ref /docs/writing-code-in-dbt/jinja-context/ref 302
-/docs/requiring-specific-dbt-versions /docs/building-a-dbt-project/dbt-projects/requiring-specific-dbt-versions 302
-/docs/return /docs/writing-code-in-dbt/jinja-context/return 302
-/docs/rpc /reference/commands/rpc 302
-/docs/run /reference/commands/run 302
-/docs/run-operation /reference/commands/run-operation 302
-/docs/run_query /docs/writing-code-in-dbt/jinja-context/run_query 302
-/docs/run_started_at /docs/writing-code-in-dbt/jinja-context/run_started_at 302
-/docs/running-a-dbt-project/command-line-interface /reference/dbt-commands 302
-/docs/running-a-dbt-project/command-line-interface/clean /reference/commands/clean 302
-/docs/running-a-dbt-project/command-line-interface/cmd-docs /reference/commands/cmd-docs 302
-/docs/running-a-dbt-project/command-line-interface/compile /reference/commands/compile 302
-/docs/running-a-dbt-project/command-line-interface/debug /reference/commands/debug 302
-/docs/running-a-dbt-project/command-line-interface/deps /reference/commands/deps 302
-/docs/running-a-dbt-project/command-line-interface/exit-codes /reference/exit-codes 302
-/docs/running-a-dbt-project/command-line-interface/global-cli-flags /reference/global-cli-flags 302
-/docs/running-a-dbt-project/command-line-interface/init /reference/commands/init 302
-/docs/running-a-dbt-project/command-line-interface/list /reference/commands/list 302
-/docs/running-a-dbt-project/command-line-interface/model-selection-syntax /reference/model-selection-syntax 302
-/docs/running-a-dbt-project/command-line-interface/rpc /reference/commands/rpc 302
-/docs/running-a-dbt-project/command-line-interface/run /reference/commands/run 302
-/docs/running-a-dbt-project/command-line-interface/run-operation /reference/commands/run-operation 302
-/docs/running-a-dbt-project/command-line-interface/seed /reference/commands/seed 302
-/docs/running-a-dbt-project/command-line-interface/snapshot /reference/commands/snapshot 302
-/docs/running-a-dbt-project/command-line-interface/source /reference/commands/source 302
-/docs/running-a-dbt-project/command-line-interface/test /reference/commands/test 302
-/docs/running-a-dbt-project/command-line-interface/version /reference/global-cli-flags#version 302
-/docs/running-a-dbt-project/using-the-command-line-interface /docs/running-a-dbt-project/using-the-cli 302
-/docs/running-a-dbt-project/using-the-command-line-interface/centos /dbt-cli/installation-guides/centos 302
-/docs/running-a-dbt-project/using-the-command-line-interface/configure-your-profile /dbt-cli/configure-your-profile 302
-/docs/running-a-dbt-project/using-the-command-line-interface/install-from-source /dbt-cli/installation-guides/install-from-source 302
-/docs/running-a-dbt-project/using-the-command-line-interface/installation /dbt-cli/installation 302
-/docs/running-a-dbt-project/using-the-command-line-interface/macos /dbt-cli/installation-guides/macos 302
-/docs/running-a-dbt-project/using-the-command-line-interface/ubuntu-debian /dbt-cli/installation-guides/ubuntu-debian 302
-/docs/running-a-dbt-project/using-the-command-line-interface/windows /dbt-cli/installation-guides/windows 302
-/docs/running-dbt-in-production /docs/running-a-dbt-project/running-dbt-in-production 302
-/docs/schema /docs/writing-code-in-dbt/jinja-context/schema 302
-/docs/schemas /docs/writing-code-in-dbt/jinja-context/schemas 302
-/docs/schemayml-files /reference/declaring-properties 302
-/docs/seed /reference/commands/seed 302
-/docs/seeds /docs/building-a-dbt-project/seeds 302
-/docs/setting-up-enterprise-sso-with-azure-active-directory /docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-sso-with-azure-active-directory 302
-/docs/setting-up-snowflake-sso /docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-snowflake-oauth 302
-/docs/setting-up-sso-with-google-gsuite /docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-google-gsuite 302
-/docs/setting-up-sso-with-okta /docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-okta 302
-/docs/slack-rules-of-the-road /docs/contributing/slack-rules-of-the-road 302
-/docs/snapshot /reference/commands/snapshot 302
-/docs/snapshots /docs/building-a-dbt-project/snapshots 302
-/docs/snowflake-configs /reference/resource-configs/snowflake-configs 302
-/docs/source /reference/commands/source 302
-/docs/statement-blocks /docs/writing-code-in-dbt/jinja-context/statement-blocks 302
-/docs/supported-databases/profile-bigquery /reference/bigquery-profile 302
-/docs/supported-databases/profile-mssql /reference/mssql-profile 302
-/docs/supported-databases/profile-postgres /reference/postgres-profile 302
-/docs/supported-databases/profile-presto /reference/presto-profile 302
-/docs/supported-databases/profile-redshift /reference/redshift-profile 302
-/docs/supported-databases/profile-snowflake /reference/snowflake-profile 302
-/docs/supported-databases/profile-spark /reference/spark-profile 302
-/docs/tags /reference/resource-configs/tags 302
-/docs/target /docs/writing-code-in-dbt/jinja-context/target 302
-/docs/test /reference/commands/test 302
-/docs/testing /docs/building-a-dbt-project/tests 302
-/docs/testing-and-documentation /docs/building-a-dbt-project/tests 302
-/docs/the-dbt-ide /docs/dbt-cloud/cloud-ide/the-dbt-ide 302
-/docs/this /docs/writing-code-in-dbt/jinja-context/this 302
-/docs/tojson /docs/writing-code-in-dbt/jinja-context/tojson 302
-/docs/ubuntu-debian /dbt-cli/installation 302
-/docs/use-an-existing-project /docs/building-a-dbt-project/dbt-projects/use-an-existing-project 302
-/docs/using-custom-aliases /docs/building-a-dbt-project/building-models/using-custom-aliases 302
-/docs/using-custom-database /docs/building-a-dbt-project/building-models/using-custom-databases 302
-/docs/using-custom-schemas /docs/building-a-dbt-project/building-models/using-custom-schemas 302
-/docs/using-dbt-cloud /docs/dbt-cloud/using-dbt-cloud 302
-/docs/using-jinja /guides/getting-started/learning-more/using-jinja 302
-/docs/using-operations /docs/building-a-dbt-project/hooks-operations 302
-/docs/using-sources /docs/building-a-dbt-project/using-sources 302
-/docs/using-sql-headers /reference/resource-configs/sql_header 302
-/docs/using-the-command-line-interface /docs/running-a-dbt-project/using-the-cli 302
-/docs/using-the-dbt-ide /docs/running-a-dbt-project/using-the-dbt-ide 302
-/docs/using-variables /docs/building-a-dbt-project/building-models/using-variables 302
-/docs/var /docs/writing-code-in-dbt/jinja-context/var 302
-/docs/version /reference/global-cli-flags#version 302
-/docs/videos /guides/legacy/videos 302
-/docs/windows /dbt-cli/installation 302
-/docs/writing-code-in-dbt/class-reference /reference/dbt-classes 302
-/docs/writing-code-in-dbt/extending-dbts-programming-environment/creating-new-materializations /guides/legacy/creating-new-materializations 302
-/docs/writing-code-in-dbt/extending-dbts-programming-environment/custom-schema-tests /guides/legacy/writing-custom-schema-tests 302
-/docs/writing-code-in-dbt/getting-started-with-jinja /docs/building-a-dbt-project/jinja-macros 302
-/docs/writing-code-in-dbt/jinja-context/adapter /reference/dbt-jinja-functions/adapter 302
-/docs/writing-code-in-dbt/jinja-context/as_text /reference/dbt-jinja-functions/as_text 302
-/docs/writing-code-in-dbt/jinja-context/builtins /reference/dbt-jinja-functions/builtins 302
-/docs/writing-code-in-dbt/jinja-context/config /reference/dbt-jinja-functions/config 302
-/docs/writing-code-in-dbt/jinja-context/dbt-project-yml-context /reference/dbt-jinja-functions/dbt-project-yml-context 302
-/docs/writing-code-in-dbt/jinja-context/dbt_version /reference/dbt-jinja-functions/dbt_version 302
-/docs/writing-code-in-dbt/jinja-context/debug-method /reference/dbt-jinja-functions/debug-method 302
-/docs/writing-code-in-dbt/jinja-context/doc /reference/dbt-jinja-functions/doc 302
-/docs/writing-code-in-dbt/jinja-context/env_var /reference/dbt-jinja-functions/env_var 302
-/docs/writing-code-in-dbt/jinja-context/exceptions /reference/dbt-jinja-functions/exceptions 302
-/docs/writing-code-in-dbt/jinja-context/execute /reference/dbt-jinja-functions/execute 302
-/docs/writing-code-in-dbt/jinja-context/flags /reference/dbt-jinja-functions/flags 302
-/docs/writing-code-in-dbt/jinja-context/fromjson /reference/dbt-jinja-functions/fromjson 302
-/docs/writing-code-in-dbt/jinja-context/fromyaml /reference/dbt-jinja-functions/fromyaml 302
-/docs/writing-code-in-dbt/jinja-context/graph /reference/dbt-jinja-functions/graph 302
-/docs/writing-code-in-dbt/jinja-context/invocation_id /reference/dbt-jinja-functions/invocation_id 302
-/docs/writing-code-in-dbt/jinja-context/log /reference/dbt-jinja-functions/log 302
-/docs/writing-code-in-dbt/jinja-context/modules /reference/dbt-jinja-functions/modules 302
-/docs/writing-code-in-dbt/jinja-context/on-run-end-context /reference/dbt-jinja-functions/on-run-end-context 302
-/docs/writing-code-in-dbt/jinja-context/profiles-yml-context /reference/dbt-jinja-functions/profiles-yml-context 302
-/docs/writing-code-in-dbt/jinja-context/project_name /reference/dbt-jinja-functions/project_name 302
-/docs/writing-code-in-dbt/jinja-context/ref /reference/dbt-jinja-functions/ref 302
-/docs/writing-code-in-dbt/jinja-context/return /reference/dbt-jinja-functions/return 302
-/docs/writing-code-in-dbt/jinja-context/run_query /reference/dbt-jinja-functions/run_query 302
-/docs/writing-code-in-dbt/jinja-context/run_started_at /reference/dbt-jinja-functions/run_started_at 302
-/docs/writing-code-in-dbt/jinja-context/schema /reference/dbt-jinja-functions/schema 302
-/docs/writing-code-in-dbt/jinja-context/schemas /reference/dbt-jinja-functions/schemas 302
-/docs/writing-code-in-dbt/jinja-context/source /reference/dbt-jinja-functions/source 302
-/docs/writing-code-in-dbt/jinja-context/statement-blocks /reference/dbt-jinja-functions/statement-blocks 302
-/docs/writing-code-in-dbt/jinja-context/target /reference/dbt-jinja-functions/target 302
-/docs/writing-code-in-dbt/jinja-context/this /reference/dbt-jinja-functions/this 302
-/docs/writing-code-in-dbt/jinja-context/tojson /reference/dbt-jinja-functions/tojson 302
-/docs/writing-code-in-dbt/jinja-context/toyaml /reference/dbt-jinja-functions/toyaml 302
-/docs/writing-code-in-dbt/jinja-context/var /reference/dbt-jinja-functions/var 302
-/docs/writing-code-in-dbt/macros /docs/building-a-dbt-project/jinja-macros 302
-/docs/writing-code-in-dbt/using-jinja /guides/getting-started/learning-more/using-jinja 302
-/faqs/getting-help/ /guides/legacy/getting-help 302
-/migration-guide/upgrading-to-0-17-0 /guides/migration/versions 302
-/migration-guide/upgrading-to-0-18-0 /guides/migration/versions 302
-/reference/accounts /dbt-cloud/api 302
-/reference/api /dbt-cloud/api 302
-/reference/connections /dbt-cloud/api 302
-/reference/data-test-configs /reference/test-configs 302
-/reference/declaring-properties /reference/configs-and-properties 302
-/reference/dbt-artifacts /reference/artifacts/dbt-artifacts 302
-/reference/environments /dbt-cloud/api 302
-/reference/events /reference/events-logging 302
-/reference/jobs /dbt-cloud/api 302
-/reference/model-selection-syntax /reference/node-selection/syntax 302
-/reference/project-configs/on-run-end /reference/project-configs/on-run-start-on-run-end 302
-/reference/project-configs/on-run-start /reference/project-configs/on-run-start-on-run-end 302
-/reference/repositories /dbt-cloud/api 302
-/reference/resource-configs/post-hook /reference/resource-configs/pre-hook-post-hook 302
-/reference/resource-configs/pre-hook /reference/resource-configs/pre-hook-post-hook 302
-/reference/resource-properties/tags /reference/resource-configs/tags 302
-/reference/runs /dbt-cloud/api 302
-/reference/using-the-dbt-cloud-api /dbt-cloud/api 302
-https://tutorial.getdbt.com/* https://docs.getdbt.com/:splat 301!
-/reference/model-selection-syntax/#test-selection-examples /reference/node-selection/test-selection-examples 302
-/docs/building-a-dbt-project/building-models/using-custom-database /docs/building-a-dbt-project/building-models/using-custom-databases 302
-/dbt-cloud/api /dbt-cloud/api-v2 302
-/reference/project-configs/source-paths /reference/project-configs/model-paths 302
-/reference/project-configs/data-paths /reference/project-configs/seed-paths 302
-/reference/project-configs/modules-paths /reference/project-configs/packages-install-path 302
-/docs/dbt-cloud/using-dbt-cloud/cloud-slack-notifications /docs/dbt-cloud/using-dbt-cloud/cloud-notifications 302
-/reference/warehouse-profiles/presto-profile /reference/profiles.yml 302
-/setting-up /guides/getting-started/getting-set-up/setting-up-bigquery 302
-/tutorial/setting-up /guides/getting-started 302
-/tutorial/test-and-document-your-project /guides/getting-started/building-your-first-project/test-and-document-your-project 302
-/tutorial/build-your-first-models /guides/getting-started/building-your-first-project/build-your-first-models 302
-/tutorial/deploy-your-project /guides/getting-started/building-your-first-project/schedule-a-job 302
-/tutorial/using-jinja /guides/getting-started/learning-more/using-jinja 302
-/tutorial/refactoring-legacy-sql /guides/getting-started/learning-more/refactoring-legacy-sql 302
-/tutorial/2b-create-a-project-dbt-cli.md /guides/getting-started/learning-more/getting-started-dbt-core 302
-/tutorial/create-a-project-dbt-cli /guides/getting-started/learning-more/getting-started-dbt-core 302
-/tutorial/2a-create-a-project-dbt-cloud.md /guides/getting-started 302
-/tutorial/create-a-project-dbt-cloud /guides/getting-started 302
-/tutorial/getting-started /guides/getting-started 302
-/docs/dbt-cloud/cloud-changelog /docs/dbt-cloud/release-notes 302
-/faqs/all /docs/faqs 301!
-/faqs/_ /docs/faqs/:splat 301
-/tutorial/learning-more/_ /guides/getting-started/learning-more/:splat 301
-/tutorial/getting-set-up/\_ /guides/getting-started/getting-set-up/:splat 301
-/tutorial/building-your-first-project/\* /guides/getting-started/building-your-first-project/:splat 301
-/tutorial/refactoring-legacy-sql /guides/getting-started/learning-more/refactoring-legacy-sql 302
-/blog/change-data-capture-metrics /blog/change-data-capture 301
-/blog/model-timing-tab /blog/how-we-shaved-90-minutes-off-model 301
-
-# supported data platforms page
-
-/docs/profile /docs/supported-data-platforms 302
-/docs/available-adapters /docs/supported-data-platforms 302
-/docs/supported-databases /docs/supported-data-platforms 302
-
-# migration and legacy guides
-
-/docs/guides/migration-guide/upgrading-to-0-14-0 /guides/migration/versions 302
-/docs/guides/migration-guide/upgrading-to-0-15-0 /guides/migration/versions 302
-/docs/guides/migration-guide/upgrading-to-0-16-0 /guides/migration/versions 302
-/docs/guides/migration-guide/upgrading-to-0-17-0 /guides/migration/versions 302
-/docs/guides/migration-guide/upgrading-to-0-18-0 /guides/migration/versions 302
-/docs/guides/migration-guide/upgrading-to-0-19-0 /guides/migration/versions 302
-/docs/guides/migration-guide/upgrading-from-0-10-to-0-11 /guides/migration/versions 302
-/docs/guides/migration-guide/upgrading-to-014 /guides/migration/versions 302
-/docs/upgrading-to-014 /guides/migration/versions 302
-/docs/upgrading-to-0-14-1 /guides/migration/versions 302
-/docs/upgrading-to-0-16-0 /guides/migration/versions 302
-/docs/guides/migration-guide/upgrading-to-0-20-0 /guides/migration/versions/upgrading-to-v0.20 302
-/docs/guides/migration-guide/upgrading-to-0-21-0 /guides/migration/versions/upgrading-to-v0.21 302
-/docs/guides/migration-guide/upgrading-to-1-0-0 /guides/migration/versions/upgrading-to-v1.0 302
-/docs/guides/migration-guide/upgrading-to-v1.0 /guides/migration/versions/upgrading-to-v1.0 302
-/docs/guides/getting-help /guides/legacy/getting-help 302
-/docs/guides/migration-guide/_ /guides/migration/versions/:splat 301!
-/docs/guides/_ /guides/legacy/:splat 301!
-docs/contributing/building-a-new-adapter /docs/contributing/adapter-development/3-building-a-new-adapter 302
-docs/contributing/testing-a-new-adapter /docs/contributing/adapter-development/4-testing-a-new-adapter 302
-docs/contributing/documenting-a-new-adapter /docs/contributing/adapter-development/5-documenting-a-new-adapter 302
-
-/docs/dbt-cloud/using-dbt-cloud/cloud-metrics-layer        /docs/use-dbt-semantic-layer/dbt-semantic-layer 301!
-/docs/building-a-new-adapter /docs/contributing/adapter-development/3-building-a-new-adapter 301!
-/reference/warehouse-profiles/impala-profile /reference/warehouse-setups/impala-setup 302
-/reference/warehouse-profiles/exasol-profile /reference/warehouse-setups/exasol-setup 302
-/reference/warehouse-profiles/layer-profile /reference/warehouse-setups/layer-setup 302
-/reference/warehouse-profiles/postgres-profile /reference/warehouse-setups/postgres-setup 302
-/reference/warehouse-profiles/greenplum-profile /reference/warehouse-setups/greenplum-setup 302
-/reference/warehouse-profiles/alloydb-profile /reference/warehouse-setups/alloydb-setup 302
-/reference/warehouse-profiles/azuresynapse-profile /reference/warehouse-setups/azuresynapse-setup 302
-/reference/warehouse-profiles/snowflake-profile /reference/warehouse-setups/snowflake-setup 302
-/reference/warehouse-profiles/rockset-profile /reference/warehouse-setups/rockset-setup 302
-/reference/warehouse-profiles/trino-profile /reference/warehouse-setups/trino-setup 302
-/reference/warehouse-profiles/glue-profile /reference/warehouse-setups/glue-setup 302
-/reference/warehouse-profiles/duckdb-profile /reference/warehouse-setups/duckdb-setup 302
-/reference/warehouse-profiles/vertica-profile /reference/warehouse-setups/vertica-setup 302
-/reference/warehouse-profiles/clickhouse-profile /reference/warehouse-setups/clickhouse-setup 302
-/reference/warehouse-profiles/athena-profile /reference/warehouse-setups/athena-setup 302
-/reference/warehouse-profiles/iomete-profile /reference/warehouse-setups/iomete-setup 302
-/reference/warehouse-profiles/mssql-profile /reference/warehouse-setups/mssql-setup 302
-/reference/warehouse-profiles/tidb-profile /reference/warehouse-setups/tidb-setup 302
-/reference/warehouse-profiles/materialize-profile /reference/warehouse-setups/materialize-setup 302
-/reference/warehouse-profiles/redshift-profile /reference/warehouse-setups/redshift-setup 302
-/reference/warehouse-profiles/databricks-profile /reference/warehouse-setups/databricks-setup 302
-/reference/warehouse-profiles/bigquery-profile /reference/warehouse-setups/bigquery-setup 302
-/reference/warehouse-profiles/dremio-profile /reference/warehouse-setups/dremio-setup 302
-/reference/warehouse-profiles/oracle-profile /reference/warehouse-setups/oracle-setup 302
-/reference/warehouse-profiles/teradata-profile /reference/warehouse-setups/teradata-setup 302
-/reference/warehouse-profiles/singlestore-profile /reference/warehouse-setups/singlestore-setup 302
-/reference/warehouse-profiles/sqlite-profile /reference/warehouse-setups/sqlite-setup 302
-/reference/warehouse-profiles/spark-profile /reference/warehouse-setups/spark-setup 302
-/reference/warehouse-profiles/mindsdb-profile /reference/warehouse-setups/mindsdb-setup 302
-/reference/warehouse-profiles/ibmdb2-profile /reference/warehouse-setups/ibmdb2-setup 302
-/reference/warehouse-profiles/firebolt-profile /reference/warehouse-setups/firebolt-setup 302
-/reference/warehouse-profiles/mysql-profile /reference/warehouse-setups/mysql-setup 302
-/reference/warehouse-profiles/hive-profile /reference/warehouse-setups/hive-setup 302
-/reference/using-sources /docs/build/sources 302
-
-# getting started guide
-/guides/getting-started /docs/get-started/getting-started/overview 301
-/guides/getting-started/building-your-first-project /docs/get-started/getting-started/building-your-first-project/build-your-first-models 301
-/guides/getting-started/building-your-first-project/build-your-first-models /docs/get-started/getting-started/building-your-first-project/build-your-first-models 301
-/guides/getting-started/building-your-first-project/schedule-a-job /docs/get-started/getting-started/building-your-first-project/schedule-a-job 301
-/guides/getting-started/building-your-first-project/test-and-document-your-project /docs/get-started/getting-started/building-your-first-project/test-and-document-your-project 301
-/guides/getting-started/create-a-project /docs/get-started/getting-started/create-a-project 301
-/guides/getting-started/getting-set-up /docs/get-started/getting-started/set-up-dbt-cloud 301
-/guides/getting-started/getting-set-up/setting-up-bigquery /docs/get-started/getting-started/getting-set-up/setting-up-bigquery 301
-/guides/getting-started/getting-set-up/setting-up-databricks /docs/get-started/getting-started/getting-set-up/setting-up-databricks 301
-/guides/getting-started/getting-set-up/setting-up-redshift /docs/get-started/getting-started/getting-set-up/setting-up-redshift 301
-/guides/getting-started/getting-set-up/setting-up-snowflake /docs/get-started/getting-started/getting-set-up/setting-up-snowflake 301
-/guides/getting-started/getting-started /docs/get-started/getting-started/set-up-dbt-cloud 301
-/guides/getting-started/learning-more /docs/get-started/getting-started-dbt-core 301
-/guides/getting-started/learning-more/getting-started-dbt-core /docs/get-started/getting-started-dbt-core 301
-/guides/getting-started/learning-more/refactoring-legacy-sql /docs/get-started/learning-more/refactoring-legacy-sql 301
-/guides/getting-started/learning-more/using-jinja /docs/get-started/learning-more/using-jinja 301
-
-# ide ia redirects
-/docs/dbt-cloud/cloud-ide/the-dbt-ide  /docs/getting-started/dbt-cloud-features 301!
-/docs/dbt-cloud/cloud-ide/handling-merge-conflicts  /docs/collaborate/git/resolve-merge-conflicts 301!
-/dbt-cloud/cloud-ide/viewing-docs-in-the-ide  /docs/getting-started/develop-in-the-cloud 301!
-/docs/dbt-cloud/cloud-ide/ide-beta  /docs/getting-started/develop-in-the-cloud 301!
-/docs/running-a-dbt-project/using-the-dbt-ide  /docs/getting-started/develop-in-the-cloud 301!
-/dbt-cloud/cloud-ide/the-ide-git-button  /docs/collaborate/git/version-control-basics 301!
-
-# Community docs
-/docs/contributing/long-lived-discussions-guidelines /community/resources/forum-guidelines 301
-/docs/guides/legacy/navigating-the-docs.md /community/contribute 301
-/community/writing-on-discourse/ /community/contributing/contributing-online-community 301
-/community/contributing/ /community/contribute 301
-/docs/contributing/contributor-license-agreements /community/resources/contributor-license-agreements 301
-/community/maintaining-a-channel /community/resources/maintaining-a-channel 301
-/docs/contributing/oss-expectations /community/resources/oss-expectations 301
-/docs/contributing/slack-rules-of-the-road /community/resources/slack-rules-of-the-road 301
-
-/blog/getting-started-with-the-dbt-semantic-layer /blog/understanding-the-components-of-the-dbt-semantic-layer 301!
-/docs/getting-started/develop-in-the-cloud#creating-a-development-environment  /docs/get-started/develop-in-the-cloud#set-up-and-access-the-cloud-ide 301
-
diff --git a/contributing/adding-page-components.md b/contributing/adding-page-components.md
index 5cbe8a6d471..751f7c1f6c1 100644
--- a/contributing/adding-page-components.md
+++ b/contributing/adding-page-components.md
@@ -1,6 +1,6 @@
 ## Using warehouse components
 
-You can use the following components to provide code snippets for each supported warehouse. You can see a real-life example in the docs page, "[Initialize your database](/docs/get-started/getting-started/getting-set-up/setting-up-databricks#initialize-your-dbt-project)."
+You can use the following components to provide code snippets for each supported warehouse. You can see a real-life example in the docs page [Initialize your project](/quickstarts/databricks?step=6).
 
 Identify code by labeling with the warehouse names:
 
@@ -62,7 +62,7 @@ Identify code and code files by labeling with the component they are describing:
 
         ```yml
         models:
-        [](resource-path):
+        [](/reference/resource-configs/resource-path):
 
 
         ```
@@ -77,7 +77,7 @@ Identify code and code files by labeling with the component they are describing:
 
         ```yml
         sources:
-        [](resource-path):
+        [](/reference/resource-configs/resource-path):
 
 
         ```
@@ -87,4 +87,21 @@ Identify code and code files by labeling with the component they are describing:
         
 
         
-```
\ No newline at end of file
+```
+
+### Link to tabbed content
+
+You can use the [queryString](https://docusaurus.io/docs/next/markdown-features/tabs?current-os=ios#query-string) prop in the `<Tabs>` tag. This allows you to share a link to a page with a pre-selected tab so that clicking on a tab creates a unique hyperlink for that tab. However, this feature doesn't provide an anchor link, which means the browser won't scroll to the tab. Additionally, you can define the search parameter name to use. If the tabs content is under a header, you can alternatively link to the header itself, instead of using the `queryString` prop.
+
+In the following example, clicking a tab adds a search parameter to the end of the URL: `?current-os=android` or `?current-os=ios`.
+
+```
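+<Tabs queryString="current-os">
+  <TabItem value="android" label="Android">
+    Android
+  </TabItem>
+  <TabItem value="ios" label="iOS">
+    iOS
+  </TabItem>
+</Tabs>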
+```
diff --git a/contributing/content-style-guide.md b/contributing/content-style-guide.md
index d59eb98e118..eaa090a00b6 100644
--- a/contributing/content-style-guide.md
+++ b/contributing/content-style-guide.md
@@ -5,6 +5,9 @@ Welcome to the content style guide for docs.getdbt.com! We aim to provide docs t
 This guide includes standards we want to emphasize, likely because we've made deliberate decisions about them. You can refer to [_The Microsoft Writing Style Guide_](https://docs.microsoft.com/en-us/style-guide/welcome/) and the [_Chicago Manual of Style_](https://www.chicagomanualofstyle.org/home.html) for those nagging questions like, "[Should I use an Em dash, En dash or hyphen?](https://docs.microsoft.com/en-us/style-guide/punctuation/dashes-hyphens/)"
 
 ### Table of Contents
+* [Folder Structure and TOC](#folder-structure-and-toc)
+* [Filenaming](#filenaming)
+* [Using Markdown](#using-markdown)
 * [Callouts](#callouts)
 * [Text formatting](#Text-formatting)
 * [UI elements](#UI-elements)
@@ -13,16 +16,75 @@ This guide includes standards we want to emphasize, likely because we've made de
 * [Oxford comma](#Oxford-comma)
 * [Lists](#Lists)
 * [Tables](#Tables)
-* [Word choice & terminology](#Word-choice-&-terminology)
+* [Cards](#Cards)
+* [Word choice & terminology](#word-choice--terminology)
 * [Links](#Links)
 * [Images](#Images)
 * [Talk to us](#Talk-to-us)
 
+## Folder Structure and TOC
+
+The folder structure for the [docs.getdbt.com](https://github.com/dbt-labs/docs.getdbt.com) repository is organized into several high-level categories under the main `website` folder such as `blog`, `cypress`, `docs`, `functions`, `plugins`, `snippets`, `src`, `static`, and so on.
+
+The left sidebar (leftbar) is managed in the `sidebars.js` file. You only need to edit the `sidebars.js` file when you are adding a new page or deleting an existing page. Editing the `sidebars.js` file can cause merge conflicts as it's updated often because content is being worked on daily. You will need to accept the changes from other contributors if you are committing a PR.
+
+Don't worry if you're not sure where in the leftbar a new topic belongs. Do your best and when you submit your PR, the dbt Labs Documentation team will edit it and help to find the right placement.
+
+The right sidebar (rightbar), also known as the page TOC, is created automatically for all the H2 and H3 headings on a page. For details, see [Headings and Table of contents](https://docusaurus.io/docs/markdown-features/toc) in Docusaurus. 
+
+## Filenaming
+
+If you're adding a new file in the [docs.getdbt.com](https://github.com/dbt-labs/docs.getdbt.com) repository, review the following file name guidance and examples: 
+
+- Keep the file name as short as possible since it becomes part of the URL and leave out any unnecessary words. Long URLs are hard to read and prone to errors.
+- Think about where you want to locate the file in the leftbar. This will help catch and avoid redundancy. Remain consistent with the surrounding leftbar items.
+- Be as descriptive as possible so readers have an idea as to what they’re about to read.
+
+| ✅ Use | ❌ Avoid|
+|:-------:|:-------:|
+|`/docs/cloud/about-cloud/architecture` | `/docs/deploy/how-we-think-about-architecture`|
+
+
+## Using Markdown
+
+docs.getdbt.com uses its own CSS, and Docusaurus supports its own specific Markdown syntax. Review the basic Markdown syntax [document](https://www.markdownguide.org/basic-syntax/) for the supported syntax elements. For custom syntaxes, review the following table: 
+
+| Element                                     | Syntax                                                |
+|---------------------------------------------|-------------------------------------------------------|
+| Link - external site                        | `[Title](https://www.example.com)`                    |
+| Link - topic in same folder                 | `[Title](/folder/file-name) without file extension`*   |
+| Link - topic in different folder            | `[Title](/folder/file-name) without file extension`*   |
+| Link - section in topic in same folder      | `[Title](/folder/file-name#section-name)`*             |
+| Link - section in topic in different folder | `[Title](/folder/file-name#section-name)`*            |
+| Image                                       | ``|
+
+*docs.getdbt.com uses specific folders when linking to topics or sections. A successful link syntax begins with one of the following folder paths:
+
+- `/docs` 
+- `/guides` 
+- `/reference`
+
+**Example**
+
+:white_check_mark: `[Title](/guides/file-name)` 
+
+:x: `[Title](/blog/file-name)` 
+
 ## Callouts
 
 Callouts highlight important or high-value information that readers need to know. We want callouts to stand out, so we should keep their content to a minimum, avoiding general information, permissions, or prerequisites. Too much information can make it difficult to absorb. Imagine driving down one block with five stop signs!?!
 
-Use callouts sparingly for high-value information — avoid including general information, permissions, or prerequisites in callouts.
+If you add text after the opening callout marker, like this `:::note This is a note title`, it shows up as a title for the note.
+
+Callout formats include:
+
+| Types of callouts | Callout formats |
+| ---- | ------ |
+| Note callouts are used for notices| ```:::note``` 

```text```

```:::``` | +| Info callouts are used to highlight info |```:::info```

```text```

```:::``` | +| Tip callouts are used for tips |```:::tip```

```text```

```:::``` | +| Caution callouts are used for warnings/considerations |```:::caution```

```text```

```:::``` | + ## Text formatting You consider many elements when designing technical docs, and the way you format text can help you achieve a well-articulated design. With consistency of use, well-formatted text creates a single voice when there are multiple authors/contributors, increases the readability of the content, and further establishes an organization's branding. @@ -59,6 +121,8 @@ Use [code blocks](#code-blocks) for longer or more complex commands. Do _not_ us Keep lines in code samples to about 60 characters, to avoid requiring readers to scroll horizontally in the code block. Locate explanatory text before the code block, rather than using comments inside the code block. +You can look at the different [languages you can use in code blocks](https://markdown.land/markdown-code-block#markdown-code-block-language-list), which will change how the code example is formatted (like highlighting). For example, `yaml` or `shell` are commonly used in the dbt product documentation. + Within code blocks: * Avoid using markup before the command output. @@ -277,7 +341,119 @@ A table following an H3 heading: > | `-readable` | Print output in human readable format. |
  • `true`
  • `false`
| > | `-file` | Print output to file instead of stdout. | Name of the file. | +## Cards + +Use the ``: creates 2 column cards +- ``: creates 3 columns cards +- ``: creates 4 columns cards (use sparingly) +- ``: creates 5 columns cards (use sparingly) +- You can't create cards with 6 or more columns as that would provide users a poor experience. + +Refer to [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features) and [Quickstarts](/docs/quickstarts/overview) as examples. + +### Create cards + +To create cards in markdown, you need to: + +- Start off by using the appropriate `` for your use case +- Create a `` + +Refer to the following prop list for detailed explanation and examples: + +| Prop | Type | Info | Example | +| ---- | ---- | ---- | ------- | +| `title` | required | The title should be clear and explain an action the user should take or a product/feature. | `title: dbt Cloud IDE` +| `body` | required | The body contains the actionable or informative text for the user. You can include `` | + +The following is an example of a 4 card column: + +``` +
+ + + + + + + + + +
+``` + ## Word choice & terminology +Use active voice instead of passive. Active voice is clearer and more direct, making it easier to translate. + +✅ Use: The contributor writes the documentation. + +❌ Avoid: The documentation is written by contributors. + +### Active voice + +Use the active voice most of the time. Use the passive voice sparingly. + +- Passive voice: _Files are added by developers._ +- Active voice: _Developers add files._ + +Active voice provides the following advantages: + +- Active voice is generally shorter than passive voice. +- Active voice is easier for users to understand and often results in shorter content. +- Most readers mentally convert passive voice to active voice. Why subject readers to extra processing time? By sticking to active voice, you enable readers to skip the preprocessor stage and go straight to compilation. +- Passive voice confuses your ideas and reports action indirectly. +- Some passive voice sentences omit an actor altogether, which forces the reader to guess the actor's identity. + + +Sometimes, using passive voice is appropriate. Make sure it’s an intentional choice that communicates the idea more clearly than active voice would. For example, when the system is the actor, rather than a person. + + +✅ Use | ❌ Avoid +--- | ---| +(Active voice) Ask someone with access to dbt Cloud to transform the data. | This data transformation can be done by someone with access to dbt Cloud. | +(Passive voice — exceptions) The open-sourced rpc plugin is used by the Cloud IDE to recompile changes made in your project. | We are using the open-sourced rpc plugin for the Cloud IDE to recompile changes made in your project. | + +### Spelling + +In general, when the spelling of English words varies by locale — use the US spelling. For example: + +✅ Use | ❌ Avoid +-- | -- +standardize | standardise +license | licence +color | colour + + +Avoid regional idiomatic phrases as well. 
For example, a common saying amongst English speakers in India is "do the needful," but this phrase may be unrecognizable to English speakers from other regions. ### Abbreviations @@ -304,7 +480,6 @@ Some common Latin abbreviations and other words to use instead: | e.g. |
  • for example
  • like
|
  • Join both the dedicated #adapter-ecosystem channel in dbt Slack and the channel for your adapter's data store (for example, #db-sqlserver and #db-athena)
  • Using Jinja in SQL provides a way to use control structures (like `if` statements and `for` loops) in your queries
| | etc. |
  • and more
  • and so forth
|
  • A continuous integration environment running pull requests in GitHub, GitLab, and more
  • While reasonable defaults are provided for many such operations (like `create_schema`, `drop_schema`, `create_table`, and so forth), you might need to override one or more macros when building a new adapter
| - ### Prepositions Avoid ending a sentence with a preposition unless the rewritten sentence would sound awkward or too formal. @@ -315,7 +490,7 @@ Product names, trademarks, services, and tools should be written as proper nouns ### Terms to use or avoid -Use industry-specific terms and research new/improved terminology. Also refer to the Inclusive Language section of this style guide for inclusive and accessible language and style. +Use industry-specific terms and research new/improved terminology. Also refer to the Inclusive Language section of this style guide for inclusive and accessible language and style. **DO NOT** use jargon or language familiar to a small subset of readers or assume that your readers understand ALL technical terms. @@ -336,7 +511,42 @@ username | login ## Links -Links embedded in documentation are about trust. Users trust that we will lead them to sites or pages related to their reading content. In order to maintain that trust, it is important that links are transparent, up-to-date, and lead to legitimate resources. +Links embedded in documentation are about trust. Users trust that we will lead them to sites or pages related to their reading content. In order to maintain that trust, it's important that links are transparent, up-to-date, and lead to legitimate resources. + +### Internal links + +All internal links should use relative and not absolute paths. We construct these paths in relation to the content root, which is`[_docs.getdbt.com repository_/website/docs](https://github.com/dbt-labs/docs.getdbt.com/tree/current/website/docs)`. + +We require either _file_ paths relative to the content root (these include the file extension, such as `.md`) or _URL_ paths relative to the content root (these don't include `.md`). 
We avoid paths relative to the document (for example, one directory above a document `../LinkedDocument`) because they won't work during local development and testing. With paths relative to the content root, moving a document won't break the links it contains. + +Markdown links in Docusaurus open in the same window rather than creating a new browser tab, but you can use HTML or full URLs to open a link in a new tab. + +The file or URL paths begin with: +- /docs/ +- /guides/ +- /reference/ +- /community/ + +Let's use the Regions & IP Addresses URL as an example: https://docs.getdbt.com/docs/cloud/about-cloud/regions-ip-addresses +If we need to reference this on another page, we can remove the domain entirely: + +`For more information about server availability, please refer to our [Regions & IP Addresses page](/docs/cloud/about-cloud/regions-ip-addresses)` + +The reader will see: + +For more information about server availability, please refer to our [Regions & IP Addresses page](/docs/cloud/about-cloud/regions-ip-addresses) + +You can link to a specific section of the doc with a `#` at the end of the path. Enter the section’s title after the `#`, with individual words separated by hyphens. Let's use the incremental models page, https://docs.getdbt.com/docs/build/incremental-models, as an example: + +`To better understand this model type, read our [incremental models page](/docs/build/incremental-models#understanding-incremental-models).` + +This will appear to the reader as follows: + +To better understand this model type, read our [incremental models page](/docs/build/incremental-models#understanding-incremental-models). + +When you click on the link, it automatically takes you to the section defined at the end of the path. If the path syntax is incorrect (or the section does not exist), the link will take the reader to the top of the page specified in the path. 
+ +There are different methods for handling this based on page location (and other nuances), so please reference the [Docusaurus docs site](https://docusaurus.io/docs/markdown-features/links) for more detailed information. ### Link format @@ -347,6 +557,11 @@ Hyperlinks should be text only, please avoid image-based links. The text should :x: For more information, [_Click Here_](https://docs.getdbt.com/) ✅ For more information, visit the [_dbt Labs doc site_](https://docs.getdbt.com/). + + ✅ For more information, read the [_dbt Labs doc site_](https://docs.getdbt.com/). + + ✅ For more information, refer to the [_dbt Labs doc site_](https://docs.getdbt.com/). + ### Link destinations @@ -398,11 +613,16 @@ Both macOS and Windows include tools that allow you to capture and manipulate sc #### Screenshot guidelines -Once you have determined that a screenshot will add value to the document where words alone can't, refer to these guidelines for capturing the information: +Once you have determined that a screenshot will add value to the document where words alone can't, use the [Lightbox component](/contributing/lightbox) and the following guidelines to capture the information: * Use screenshots to highlight navigation, on-screen elements, and other noteworthy product visuals. * Avoid using screenshots to demonstrate inputs and outputs. All code snippets and sample results should be in the documents as text fields. +* Add images under the `static` -> `img` folder. * Use concise filenames that are relevant to the content contained within. Enumerate them if they are part of a sequence. +* Use JPEG format, which renders a better quality and lossless compression. JPEG format has a white background and is accessible in light and dark mode. +* Add *title=""* for all images to write a concise title of the image. For accessibility, it's important to use succinct text that is clear and complete. 
+ +For more information about image-formatting, review the [Lightbox guidance](/contributing/lightbox), and the following examples: :x: screenshot-august0822.jpg diff --git a/contributing/developer-blog.md b/contributing/developer-blog.md index 1ad3c271ed3..aa9d5b33131 100644 --- a/contributing/developer-blog.md +++ b/contributing/developer-blog.md @@ -6,6 +6,8 @@ The dbt Developer Blog is a place where analytics practitioners can go to share their knowledge with the community. Analytics Engineering is a discipline we’re all building together. The developer blog exists to cultivate the collective knowledge that exists on how to build and scale effective data teams. +We currently have editorial capacity for 10 Community contributed developer blogs per quarter - if we are oversubscribed we suggest you post on another platform or hold off until the editorial team is ready to take on more posts. + ### What makes a good developer blog post? - The short answer: Practical, hands on analytics engineering tutorials and stories diff --git a/contributing/lightbox.md b/contributing/lightbox.md new file mode 100644 index 00000000000..baccbe769f7 --- /dev/null +++ b/contributing/lightbox.md @@ -0,0 +1,27 @@ + +You can use the Lightbox component to add an image or screenshot to your page. It supports clickable images that expand when clicked. + +## Available Props: + +| **Prop** | **Info** | **Required?** | **Type** | +|--------|--------|-------------|--------| +| **src** | The path to the image. For example, `"/img/hamburger-icon.png"` | Required | string | +| **alt** | Set the alt text for the image. This will not show text underneath the image. | Optional | string | +| **title** | This sets the text displayed underneath the image.

If the `alt` prop is not set, but `title` is, the title will be used as the alt text as well.| Optional | string | +| **collapsed** | Set the images to be inline, rather than stacked.

This is useful for when displaying two images side by side ([example](/docs/cloud/manage-access/set-up-sso-azure-active-directory)), or when displaying the image inline with a sentence. | Optional | boolean (true/false) | +| **width** | Set a custom width for the image.

Accepts `px` and `%` values. For example: `600px` , `100%` | Optional (Defaults to max width of 400px) | | +| **alignment** | Determine if image should be left, center, or right aligned.

Accepts `left`, `right`, `center`. If any other value is entered, it defaults to `center`| Optional (Defaults to center) | | + +## Example usage + +
+src="/img/hamburger-icon.jpg"
+alt="Alt text"
+title="This text is visible"
+collapsed={true}
+width="600px"
+alignment="left"
+/> + + + diff --git a/contributing/operating-model/outline.md b/contributing/operating-model/outline.md new file mode 100644 index 00000000000..44d1b3dc46a --- /dev/null +++ b/contributing/operating-model/outline.md @@ -0,0 +1,58 @@ +## Summary + +The operating model clarifies how to work with us and shows how we function, and increases our visibility and awareness. Here’s an outline that we can flesh out at Write the Docs in Portland 2023. (Our very first team offsite!) + +## What do we want to accomplish? + +1. Our Why? Define our strategy. Clearly define the goals that will drive the rest of the planning process. This is our mission statement, our voice and tone, and our statement of purpose. +2. How we work. Tease out the internal team parts of our operating model. This part translates strategic intent into operational capabilities. Provides a foundation for action, and a clear guide for those who join our team, but also a window for others to better understand us. Includes who we are, our culture, our stakeholders. +3. How to work with us. Define our external facing operations and processes. This will provide a clear guide for other teams so they understand what is expected of them, and what to expect of us. Examples might take shape as: Process Maps, procedures, guidelines, work instructions, standards, tools, and more. + +## Shaping our **Operating model** + +### Strategy + +- [Style guide](https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/content-style-guide.md) +- Contribution guidelines +- Voice & tone + _What is our Voice & Tone?_ + + > From the Community: Stick with it and, soon enough, you'll be able to harness the full power of dbt to achieve your data goals. Stay persistent, stay positive, and (most importantly) believe in yourself. You've got this! + > + + > 💡Gentle and wise, but also reassuring and positive. + > + +### People + +- **Team structure:** What roles do we all have and who should you ask what? 
We are OSS maintainers as well as a Docs team +- **Culture:** Who are we and how do we conduct ourselves? +- **Key stakeholders:** Internal or external customers include OS community, dbt Community, Townies (PMs, partnerships, revenue, SAs, Customer success, Training) + +### Processes + +- **Important processes we use to** perform our work, reviewing PRs (Townie and Community). Ideas for docs: + + How we work: + + - How we prioritize our work + - How do we work in the docs repo and dbt community slack? (I feel like we should have a general understanding of what work we do in the repo and slack)? + + How to work with us: + + - How to request help from Docs / open an issue in GitHub. Great [data team example](https://www.notion.so/0dcd2d1375274ce9bfac609f3a4a8f6d) here! + - How to communicate with community prod docs team + - How to learn the style guide + - How do we work with other closely related teams like Developer experience? (Who to reach out to for what?) +- **Communication:** What is our SLA (48hours?), how we handle Slack channels, what to ask in Slack vs GitHub issues vs Docs team projects, Docs Office hours (when we’re ready) +- **Information we produce and consume**: Systems that we use to do our work, where is it stored, how can you contribute to it? + +### Data + +- Key **quantitative** metrics that we use to measure success. +- Key **qualitative** metrics that we use to measure success and make decisions. + +### Tech/product + +- Technology: Where do we work and what are the systems in which we work? +- Product: How does state of the product and changes to the product affect our team? 
diff --git a/contributing/single-sourcing-content.md b/contributing/single-sourcing-content.md index 1c47b87b977..7c345a6631a 100644 --- a/contributing/single-sourcing-content.md +++ b/contributing/single-sourcing-content.md @@ -5,7 +5,7 @@ * [Versioning entire pages](#versioning-entire-pages) * [Versioning blocks of content](#versioning-blocks-of-content) * [Using global variables](#using-global-variables) -* [Reusing snippets of content](#reusing-snippets-of-content) +* [Reusing content](#reusing-content) ## About versioning @@ -15,9 +15,9 @@ Versions are managed in the `versions` array located in the `website/dbt-version ### Adding a new version -To add a new version to the site, a new object must be added to the `versions` array in the same format as existing versions. This object holds two properties: **version** and **EOLDate (See End of Life Dates below)**. +To add a new version to the site, a new object must be added to the `versions` array in the same format as existing versions. This object holds two properties: **version** and **EOLDate (See End of Life Dates below)**. -Example Version: +Example Version: ```jsx exports.versions = [ @@ -28,15 +28,15 @@ exports.versions = [ ] ``` -The **version** property is the value which shows in the nav dropdown. This value is compared to the VersionBlock component on a docs page to determine whether that section should be visible for the current active version (See the **Versioning the Sidebar** section on using the VersionBlock component). +The **version** property is the value shown in the nav dropdown. This value is compared to the VersionBlock component on a docs page to determine whether that section should be visible for the currently active version (See the **Versioning the Sidebar** section on using the VersionBlock component). ### Using end-of-life dates The **EOLDate** property determines when a version is no longer supported. A version is supported up until 1 year after its release. 
-When a documentation page is viewed, the **EOLDate** property for the active version is compared to today’s date. If the current version has reached, or is nearing the end of support, a banner will show atop the page, notifying the visitor of the end-of-life status. +When a documentation page is viewed, the **EOLDate** property for the active version is compared to today’s date. If the current version has reached or is nearing the end of support, a banner will show atop the page, notifying the visitor of the end-of-life status. -Two different versions of the banner will show depending on the end-of-life date: +Two different versions of the banner will show depending on the end-of-life date: - When the version is within 3 months of the **EOLDate.** - When the version has passed the **EOLDate.** @@ -47,11 +47,11 @@ The content for these two EOLDate banners are located in the `website/src/theme/ ### Versioning entire pages -If a Docs page should not be available for the selected version, it is possible to version the entire page. This is managed in the `versionedPages` array within the `website/dbt-versions.js` file. +If a Docs page is unavailable for the selected version, it is possible to version the entire page. This is managed in the `versionedPages` array within the `website/dbt-versions.js` file. Two things occur when a page is not available for the selected version: -- A banner will appear atop the page, noting this page covers a new feature which isn’t available for the selected version. +- A banner will appear atop the page, noting this page covers a new feature that isn’t available for the selected version. - The page is removed from the sidebar @@ -70,13 +70,13 @@ exports.versionedPages = [ **page** (mandatory): The path of the Docs page to version. This string must match the string for the page in the `sidebars.js` file. -**firstVersion** (optional): Sets the first version which this page is available. 
+**firstVersion** (optional): Sets the first version on which this page is available. -**lastVersion** (optional): Sets the last version which this page is available. +**lastVersion** (optional): Sets the last version on which this page is available. ## Versioning blocks of content -The **VersionBlock** component provides the ability to version a specific piece of content on a docs page. +The **VersionBlock** component provides the ability to version a specific piece of content on a docs page. This component can be added directly to a markdown file in a similar way as other components (FAQ, File, Lightbox). @@ -90,7 +90,7 @@ This component can be added directly to a markdown file in a similar way as othe Both properties can be used together to set a range where the content should show. In the example below, this content will only show if the selected version is between **0.21** and **1.0**: ```markdown - + Versioned content here @@ -99,7 +99,7 @@ Both properties can be used together to set a range where the content should sho ### Example for versioning entire pages -On the [Docs Defer page](https://docs.getdbt.com/reference/node-selection/defer), tabs are used to show different versions of a piece of code. **v0.21.0 and later** shows `--select`, while **v-.20.x and earlier** changes this to `--models`. +On the [Docs Defer page](https://docs.getdbt.com/reference/node-selection/defer), tabs are used to show different versions of a piece of code. **v0.21.0 and later** shows `--select`, while **v-.20.x and earlier** changes this to `--models`. ![oldway](https://user-images.githubusercontent.com/3880403/163254165-dea23266-2eea-4e65-b3f0-c7b6d3e51fc3.png) @@ -116,9 +116,8 @@ $ dbt run --select [...] --defer --state path/to/artifacts $ dbt test --select [...] --defer --state path/to/artifacts \``` - ``` - + You see this version block when the selected version is <= 0.20 ```markdown @@ -144,13 +143,13 @@ Using a global variable requires two steps: 2. 
Use the **Var** component to add the global variable to a page. ```jsx -// The dbtCore property is the identifer for the variable, +// The dbtCore property is the identifier for the variable, // while the name property is the value shown on the page. exports.dbtVariables = { dbtCore: { name: "dbt Core" - } + } } ``` @@ -199,13 +198,13 @@ In the above example, the **dbtCloud** property has a default name of “dbt Clo ### Global variables example -The global `` component can be used inline, for example: +The global `` component can be used inline, for example: ```markdown This piece of markdown content explains why is awesome. ``` -However, a Var component cannot start a new line of content. Fortunately, a workaround exists to use the Var component at the beginning of a line of content. +However, a Var component cannot start a new line of content. Fortunately, a workaround exists to use the Var component at the beginning of a line of content. To use the component at the beginning of a sentence, add a non-breaking space character before the component: @@ -220,22 +219,102 @@ To use the component at the beginning of a sentence, add a non-breaking space ch is awesome! ``` -## Reusing snippets of content +## Reusing content + +To reuse content on different pages, you can use some techniques like partial files or snippets. Partial files, a built-in Docusaurus feature, is the recommended method over snippets. + +### Partial file + +A partial file allows you to reuse content throughout the docs. Here are the steps you can take to create and use a partial file: + +1. Create a new markdown partial file in the `website/snippets` directory. The file name must begin with an underscore, like `_filename.md` +2. Go back to the docs file that will pull content from the partial file. +3. Add the following import file: `import ComponentName from '/snippets/_this-is-your-partial-file-name.md';` + * You must always add an import file in that format. 
Note you can name `ComponentName` (a partial component) can be whatever makes sense for your purpose. + * `.md` needs to be added to the end of the filename. +4. To use the partial component, go to the next line and add ``. This fetches the reusable content in the partial file + * Note `anyname` can be whatever makes sense for your purpose. + +You can also use this for more advanced use cases like reusable frontmatter. + +#### Partial example + +1. To create a new partial to use throughout the site, first, we will create a new markdown partial file within the snippets directory: + +```markdown +/snippets/_partial-name.md +``` + +2. Add the following reusable content in the `/snippets/_partial-name.md` partial file: + +```markdown +## Header 2 + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nullam fermentum porttitor dui, id scelerisque enim scelerisque at. +``` + +3. Now, go back to the docs file and add the following code to fetch the reusable content added in the partial file: + +```markdown +Docs content here. + +import SetUpPages from '/snippets/_partial-name.md'; + + + +Docs content here. +``` + +- `import SetUpPages from '/snippets/_partial-name.md';` — A partial file that will be imported by other files +- `` — A component that imports content from the partial file. You can also use it to pass in data into the partial using props (See 'How to use props to pass different content on multiple pages?' below). + +4. This will then render the content of the docs in the partial file. + + +
+ + +
+How to use props to pass different content on multiple pages?
+ +You can add props on the component only if you want to pass in data from the component into the partial file. This is useful for using the same partial component on +multiple docs pages and displaying different values for each. For example, if we wanted to use a partial on multiple pages and pass in a different 'feature' for each +docs page, you can write it as: + +```markdown +import SetUpPages from '/snippets/_available-enterprise-only.md'; + + +``` + +Then in the `/snippets/_available-enterprise-only.md file`, you can display that feature prop with: + +>This feature: `{props.feature}` other content etc... + +This will then translate to: + +>This feature: A really cool feature other content etc... + +In this example, the component ` + +### Snippets -The Snippet component allows for content to be reusable throughout the Docs. This is very similar to the existing FAQ component. +The Snippet component allows for content to be reusable throughout the Docs. This is very similar to the existing FAQ component. Using partial files, which is a built-in Docusaurus feature, is recommended over snippets. Creating and using a snippet requires two steps: 1. Create a new markdown snippet file in the `website/snippets` directory. -2. Use the `` component within a Docs file. +2. Use the `` component within a Docs file. -### Snippet properties +#### Snippet properties **src:** Expects the file name of the snippet which lives in the snippets directory -### Snippet example +#### Snippet example -To create a new snippet to use throughout the site, first we will create a new markdown snippet within the snippets directory: +To create a new snippet to use throughout the site, first, we will create a new markdown snippet within the snippets directory: ```markdown ## Header 2 @@ -248,7 +327,7 @@ Now, we can add this snippet to a Docs file with the Snippet component: ```markdown Docs content here. - + Docs content here. 
``` diff --git a/netlify.toml b/netlify.toml deleted file mode 100644 index 6ab92757410..00000000000 --- a/netlify.toml +++ /dev/null @@ -1,2 +0,0 @@ -[build] - functions = "functions" diff --git a/website/.eslintignore b/website/.eslintignore new file mode 100644 index 00000000000..4c54d7c1fa5 --- /dev/null +++ b/website/.eslintignore @@ -0,0 +1,10 @@ +build +node_modules +cypress +*.test.js +*.cy.js + +sidebars-*.js +docusaurus.config.js +cypress.config.js +static/fonts/** diff --git a/website/.eslintrc.json b/website/.eslintrc.json new file mode 100644 index 00000000000..31bbad8a40c --- /dev/null +++ b/website/.eslintrc.json @@ -0,0 +1,30 @@ +{ + "env": { + "browser": true, + "es2021": true, + "node": true + }, + "rules": { + "@typescript-eslint/no-var-requires": 0 + }, + "extends": ["eslint:recommended", "plugin:@typescript-eslint/recommended", "plugin:markdown/recommended"], + "overrides": [ + { + "files": ["**/*.md"], + "processor": "markdown/markdown" + }, + { + "files": ["**/*.md/*.js", "**/*.md/*.jsx"], + "rules": { + "no-unused-vars": "off", + "no-unused-labels": "off" + } + } + ], + "parser": "@typescript-eslint/parser", + "parserOptions": { + "ecmaVersion": "latest", + "sourceType": "module" + }, + "plugins": ["@typescript-eslint", "prettier", "markdown"] +} diff --git a/website/.gitignore b/website/.gitignore index e0c5b212ea1..9d56e23a488 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -22,3 +22,11 @@ npm-debug.log* yarn-debug.log* yarn-error.log* + +# feeds +/static/feeds/atom.xml +/static/feeds/rss.json +/static/feeds/rss.xml + +# Local Vercel folder +.vercel diff --git a/website/.npmrc b/website/.npmrc new file mode 100644 index 00000000000..2401fc00904 --- /dev/null +++ b/website/.npmrc @@ -0,0 +1,2 @@ +@dbt-labs:registry=https://npm.pkg.github.com +//npm.pkg.github.com/:_authToken=${NPM_TOKEN} diff --git a/website/Makefile b/website/Makefile index 9ba2bd80303..250b23e35bb 100644 --- a/website/Makefile +++ b/website/Makefile @@ -8,15 
+8,3 @@ install: build: DOCS_ENV=build npm run build - - # Create per-version redirects... hopefully we can phase these out as - # search engines de-index these absolute versioned URLs - cat ../_redirects > build/_redirects - cat ../_redirects | grep 'docs/' | awk '{ print "/v0.15" $$1 "\t" $$2 "\t" $$3 }' >> build/_redirects - cat ../_redirects | grep 'docs/' | awk '{ print "/v0.14" $$1 "\t" $$2 "\t" $$3 }' >> build/_redirects - cat ../_redirects | grep 'docs/' | awk '{ print "/v0.13" $$1 "\t" $$2 "\t" $$3 }' >> build/_redirects - cat ../_redirects | grep 'docs/' | awk '{ print "/v0.12" $$1 "\t" $$2 "\t" $$3 }' >> build/_redirects - cat ../_redirects | grep 'docs/' | awk '{ print "/v0.11" $$1 "\t" $$2 "\t" $$3 }' >> build/_redirects - cat ../_redirects | grep 'docs/' | awk '{ print "/v0.10" $$1 "\t" $$2 "\t" $$3 }' >> build/_redirects - - cat ../_headers > build/_headers diff --git a/website/api/get-discourse-comments.js b/website/api/get-discourse-comments.js new file mode 100644 index 00000000000..5ac59cfe5f2 --- /dev/null +++ b/website/api/get-discourse-comments.js @@ -0,0 +1,169 @@ +const axios = require('axios') +require("dotenv").config(); + +const { DISCOURSE_DEVBLOG_API_KEY , DISCOURSE_USER_SYSTEM } = process.env +const DEVBLOG_PROD_URL = 'https://docs.getdbt.com/blog/' +const DEV_ENV = 'dev-' +const PREVIEW_ENV = 'deploy-preview-' + +// Set API endpoint and headers +let discourse_endpoint = `https://discourse.getdbt.com` +let headers = { + 'Accept': 'application/json', + 'Api-Key': DISCOURSE_DEVBLOG_API_KEY, + 'Api-Username': DISCOURSE_USER_SYSTEM, +} + +async function getDiscourseComments(request, response) { + let topicId, comments, DISCOURSE_TOPIC_ID; + + const blogUrl = await getBlogUrl(request) + + if (blogUrl === DEVBLOG_PROD_URL) { + DISCOURSE_TOPIC_ID = 21 + } else { + DISCOURSE_TOPIC_ID = 2 + } + + try { + const env = + blogUrl === DEVBLOG_PROD_URL + ? "" + : blogUrl.includes("localhost") + ? 
DEV_ENV + : PREVIEW_ENV; + const postTitle = `${env}${request.query.title}`; + const postSlug = request.query.slug; + const cleanSlug = cleanUrl(request.query.slug); + const externalId = truncateString(`${env}${cleanSlug}`); + + console.table({ + blogUrl, + postTitle, + postSlug, + cleanSlug, + externalId, + }); + + + if (!postSlug) throw new Error("Unable to query Discourse API. Error reading slug."); + + topicId = await searchDiscourseExternalId(externalId); + + // First check if the dev blog post exists in Discourse + // Get the comments if it does + if (typeof topicId === "number") { + comments = await getDiscourseTopicbyID(topicId); + } else { + // If the dev blog post does not exist in Discourse + // Create a new topic and get the comments + topicId = await createDiscourseTopic(postTitle, externalId, cleanSlug, blogUrl, DISCOURSE_TOPIC_ID); + if (typeof topicId === "number") { + comments = await getDiscourseTopicbyID(topicId); + comments.shift(); + comments = { topicId, comments }; + + return await response.status(200).json(comments); + } else { + console.log("Unable to create Discourse topic TopicID is not a number."); + return await response.status(500).json({ error: "Unable to create Discourse topic TopicID is not a number." }); + } + } + + comments.shift(); + comments = { topicId, comments }; + + return await response.status(200).json(comments); + } catch (err) { + console.log("err on getDiscourseComments", err); + return await response.status(500).json({ error: "Unable to get topics from Discourse." 
}); + } +} + +async function createDiscourseTopic(title, externalId, slug, blogUrl, DISCOURSE_TOPIC_ID) { + console.log(`Creating a new topic in Discourse - ${title}`) + try { + const response = await axios.post(`${discourse_endpoint}/posts`, { + title: title, + raw: `This is a companion discussion topic for the original entry at ${blogUrl}${slug}`, + category: DISCOURSE_TOPIC_ID, + embed_url: `${blogUrl}${slug}`, + external_id: externalId, + tags: ['devblog'], + visible: false + }, { headers }) + + let topicId = await response.data.topic_id + + console.log('Topic successfully created with topic_id', topicId) + + return topicId + + } catch(err) { + console.log('err on createDiscourseTopic', err) + return err + } +} + +async function getDiscourseTopicbyID(topicId) { + console.log(`Topic found setting topic id - ${topicId}`) + try { + let response = await axios.get(`${discourse_endpoint}/t/${topicId}.json`, { headers }) + let { data } = await response + let post_stream = data.post_stream + let post_count = data.posts_count + + // If there is more than one comment make the topic visibile in Discourse + if (post_count > 1 && data.visible === false) { + console.log(`Topic has more than one comment. Changing visibility to visible.`) + await axios.put(`${discourse_endpoint}/t/${topicId}`, { + visible: true + }, { headers }) + } + + // Filter only 'regular' posts in Discourse. (e.g. 
not moderator actions, small_actions, whispers) + post_stream.posts = post_stream.posts.filter(post => post.post_type === 1) + + return post_stream.posts + } catch(err) { + console.log('err on getDiscourseTopicbyID', err) + return err + } +} + +async function searchDiscourseExternalId(externalId) { + console.log(`Searching for external_id in Discourse - ${externalId}`); + try { + const data = await axios.get(`${discourse_endpoint}/t/external_id/${externalId}.json`, { headers }); + return data.data.id; + } catch (err) { + if (err.response.status === 404) { + console.log("No topics found in Discourse."); + return null; + } + console.log("Unable to search Discourse for external_id.", err); + return err; + } +} + + +// Truncate external_id to 50 characters per Discourse API requirements +function truncateString(str) { + if (str.length <= 50) { + return str + } + return str.slice(0, 50) +} + +// Remove query params and hash from URL to prevent duplicate topics +function cleanUrl(url) { + return url.split("?")[0].split("#")[0]; +} + +// Create a function to get the host name from the request and add /blog/ to the end +async function getBlogUrl(req) { + const host = req.headers.host + return `https://${host}/blog/` +} + +module.exports = getDiscourseComments; diff --git a/website/api/get-discourse-topics.js b/website/api/get-discourse-topics.js new file mode 100644 index 00000000000..90d6e5af80e --- /dev/null +++ b/website/api/get-discourse-topics.js @@ -0,0 +1,136 @@ +const axios = require('axios') + +async function getDiscourseTopics(request, response) { + const { DISCOURSE_API_KEY , DISCOURSE_USER } = process.env + + const body = request.body + + try { + // Set API endpoint and headers + let discourse_endpoint = `https://discourse.getdbt.com` + let headers = { + 'Accept': 'application/json', + 'Api-Key': DISCOURSE_API_KEY, + 'Api-Username': DISCOURSE_USER, + } + + const query = buildQueryString(body) + if(!query) throw new Error('Unable to build query string.') + + // 
Get topics from Discourse + let { data: { posts, topics } } = await axios.get(`${discourse_endpoint}/search?q=${query}`, { headers }) + + // Return empty array if no topics found for search query + // 200 status is used to prevent triggering Datadog alerts + if(!topics || topics?.length <= 0) { + // Log message with encoded query and end function + console.log('Unable to get results from api request.') + console.log(`Search query: ${query}`) + return await response.status(200).json([]) + } + + // Set author and like_count for topics if not querying by specific term + let allTopics = topics + if(!body?.term) { + allTopics = topics.reduce((topicsArr, topic) => { + // Get first post in topic + const firstTopicPost = posts?.find(post => + post?.post_number === 1 && + post?.topic_id === topic?.id + ) + // If post found + // Get username + if(firstTopicPost?.username) { + topic.author = firstTopicPost.username + } + // Get like count + if(firstTopicPost?.like_count) { + topic.like_count = firstTopicPost.like_count + } + + if(firstTopicPost?.blurb) { + topic.blurb = firstTopicPost.blurb + } + + // Push updated topic to array + topicsArr.push(topic) + + return topicsArr + }, []) + } + + // Return topics + //return await returnResponse(200, allTopics) + return await response.status(200).json(allTopics) + } catch(err) { + // Log and return the error + console.log('err', err) + return await response.status(500).json({ error: 'Unable to get topics from Discourse.'}) + } +} + +function buildQueryString(body) { + if(!body) return null + + // start with empty query string + let query = '' + + // check param and apply to query if set + for (const [key, value] of Object.entries(body)) { + // validate categories + // if valid, add to query string + if(validateItem({ key, value })) { + if(key === 'category') { + query += `#${value} ` + } else if(key === 'inString') { + query += `in:${value}` + } else if(key === 'status' && Array.isArray(value)) { + value?.map(item => { + query += 
`${key}:${item} ` + }) + } else { + query += `${key}:${value} ` + } + } + } + + if(query) { + const encodedQuery = encodeURIComponent(query) + return encodedQuery + } +} + +function validateItem({ key, value }) { + // predefined Discourse values + // https://docs.discourse.org/#tag/Search/operation/search + const inStringValues = ['title', 'first', 'pinned', 'wiki'] + const orderValues = ['latest', 'likes', 'views', 'latest_topic'] + const statusValues = ['open', 'closed', 'public', 'archived', 'noreplies', 'single_user', 'solved', 'unsolved'] + + // validate keys + if(key === 'inString') { + return inStringValues.includes(value) + ? true + : false + } else if(key === 'order') { + return orderValues.includes(value) + ? true + : false + } else if(key === 'status') { + if(Array.isArray(value)) { + let isValid = true + value?.map(item => { + if(!statusValues.includes(item)) isValid = false + }) + return isValid + } else { + return statusValues.includes(value) + ? true + : false + } + } else { + return true + } +} + +module.exports = getDiscourseTopics diff --git a/website/blog/2019-05-01-how-we-structure-dbt-projects.md b/website/blog/2019-05-01-how-we-structure-dbt-projects.md deleted file mode 100644 index 8e1d4f59e32..00000000000 --- a/website/blog/2019-05-01-how-we-structure-dbt-projects.md +++ /dev/null @@ -1,263 +0,0 @@ ---- -title: "How We Structure our dbt Projects" -description: "As the maintainers of dbt, and analytics consultants, at Fishtown Analytics we build a lot of dbt projects. Over time, we’ve developed internal conventions on how we structure them." 
-slug: how-we-structure-our-dbt-projects -canonical_url: https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355 - -authors: [claire_carroll] - -tags: [dbt tutorials] -hide_table_of_contents: false - -date: 2019-05-01 -is_featured: true ---- - - -As the maintainers of dbt, and analytics consultants, at [Fishtown Analytics](https://www.getdbt.com/dbt-labs/about-us/) (now dbt Labs) we build a lot of dbt projects. Over time, we’ve developed internal conventions on how we structure them. - -This article does not seek to instruct you on how to design a final model for your stakeholders — it won’t cover whether you should denormalize everything into one wide master , or have many tables that need to be joined together in the BI layer. There are entire books on this topic. Instead, use this as a guide once you’ve already got an idea of what you’re building for how you should break the transformations up into separate dbt models. - - - -It’s important to note that **this is not the only, or the objectively best, way to structure a dbt project**. Rather, this document reflects our current opinions. These opinions are strongly influenced by: - -* our views on data model design; which in turn are influenced by: -* the kinds of analytics problems we are solving for clients -* the data stack we typically work within, in which multiple data sources are loaded by third party tools, and the is optimized for analytical queries (therefore we aren’t tightly bounded by performance optimization considerations). - -Our opinions are **almost guaranteed to change over time** as we update our views on modeling, are exposed to more analytics problems, and data stacks evolve. It’s also worth clearly stating here: the way we structure dbt projects makes sense for our projects, but may not be the best fit for yours! This article exists on Discourse so that we can have a conversation – I would love to know how others in the community are structuring their projects. 
- -In comparison, the (recently updated) [best practices](/guides/best-practices) reflect principles that we believe to be true for any dbt project. Of course, these two documents go hand in hand – our projects are structured in such a way that makes the those principles easy to observe, in particular: - -* Limit references to raw data -* Rename and recast fields once -* Group your models in directories -* Add tests to your models -* Consider the information architecture of your data warehouse -* Separate source-centric and business-centric transformations - -We also recently held (and recorded) an office hours on this topic – this article provides a high level outline, but there’s a lot more detail and discussion in the [video](https://youtu.be/xzKLh342s08). - -Lastly, before I dive in, a huge thank you to Jeremy Cohen for not only teaching me a lot of the material in this article, but also for doing a lot of the groundwork that went into this article – entire sections of this article are in fact lifted from his work. - -## Data transformation 101 ------------------------------------------------------ - -The data in any of our projects has three distinct checkpoints: - -1. **Sources**: Schemas and tables in a source-conformed structure (i.e. tables and columns in a structure based on what an API returns), loaded by a third party tool. -2. **Staging models**: The atomic unit of data modeling. Each model bears a one-to-one relationship with the source data table it represents. It has the same granularity, but the columns have been renamed, recast, or usefully reconsidered into a consistent format. -3. **Marts models**: Models that represent business processes and entities, abstracted from the data sources that they are based on. - -In a simple project, these may be the only models you build; more complex projects may have a number of intermediate models that help along this journey, as well as accessories to these models (see below). - -Still confused? 
An example might help! Let’s think about a software business that uses both Stripe and Braintree to collect subscription payments. Their three stages of modeling might look like: - -1. **Sources**: Payment records from the Stripe API and payment records from the Braintree API, loaded into their data warehouse by a third party tool. -2. **Staging models**: Both the Stripe and Braintree payments are recast into a consistent shape, with consistent column names. -3. **Marts models**: A monthly recurring revenue (MRR) model that classifies revenue per customer per month as new revenue, upgrades, downgrades, and churn, to understand how a business is performing over time. It may be useful to note whether the revenue was collected via Stripe or Braintree, but they are not fundamentally separate models. - -Of note here is that there is a distinct change that occurs between the staging and marts checkpoints – sources and staging models are source-centric, whereas marts models are business-centric. - -In our dbt projects, this leads us to our first split in our `models/` directory which helps us make this distinction: - -``` - ├── dbt_project.yml - └── models - ├── marts - └── staging - -``` - -## Staging raw data ---------------------------------------- - -The goal of the staging layer is to create staging models. Staging models take raw data, and clean and prepare them for further analysis. For a user querying the data warehouse, a relation with a `stg_` prefix indicates that: - -* Fields have been renamed and recast in a consistent way.¹ -* Datatypes, such as timezones, are consistent. -* Light cleansing, such as replacing empty string with NULL values, has occurred. -* If useful, flattening of objects might have occurred. -* There is a that is both unique and not null (and tested). 
- -Staging models can have joins in them to field additional columns for context or enrichment; add rows through unions and remove them through filters; deduplicate a natural key or hash together a [surrogate one](/blog/sql-surrogate-keys). - -Because we often work with multiple data sources, in our `staging` directory, we create one directory per source. - - -``` - ├── dbt_project.yml - └── models - ├── marts - └── staging - ├── braintree - └── stripe -``` - -Each staging directory contains at a minimum: - - -* One staging model for each object that is useful for analytics: - * Named `stg___`. - * Generally materialized as a (unless performance requires it as a table). -* A `src_.yml` file which contains: - * [Source](/docs/building-a-dbt-project/using-sources) definitions, tests, and documentation -* A `stg_.yml` file which contains - * [Tests](/docs/build/tests) and [documentation](/docs/building-a-dbt-project/documentation) for models in the same directory - -``` - ├── dbt_project.yml - └── models - ├── marts - └── staging - └── braintree - ├── src_braintree.yml - ├── stg_braintree.yml - ├── stg_braintree__customers.sql - └── stg_braintree__payments.sql -``` - - -Some dbt users prefer to have one `.yml` file per model (e.g. `stg_braintree__customers.yml`). This is a completely reasonable choice, and we recommend implementing it if your `.yml` files start to become unwieldy. - -### But what about base models? - -Earlier versions of the dbt documentation recommended implementing “base models” as the first layer of transformation – and we used to organize and name our models in this way, for example `models/braintree/base/base_payments.sql`. - -We realized that while the reasons behind this convention were valid, the naming was an opinion, so in our recent update to the [best practices](/guides/best-practices), we took the mention of base models out. 
Instead, we replaced it with the principles of “renaming and recasting once” and “limiting the dependencies on raw data”. - -That being said, in our dbt projects every source flows through exactly one model of the following form: - -``` - with source as ( - - select * from {{ source('braintree', 'payments') }} - - ), - - renamed as ( - - select - id as payment_id, - order_id, - convert_timezone('America/New_York', 'UTC', createdat) as created_at, - ... - - from source - - ) - - select * from renamed -``` - -We still refer to this a base transformation. If your source data is in good shape, this transformation may be all that’s required to build a staging model, and our staging model is this SQL. - -However, building a staging model may warrant several models’ worth of cleaning, correcting, and categorizing, or may require a join or union to another source. To ensure our data source flows through a base transformation, we extend our DAG upstream of the staging model, by creating a separate base model, that we then select from. - -![27_AM](/img/blog/dbt-project-structure-318751d1b7d09a0abc1627a7f6d219d3a8c0455b_2_690x173.jpeg) - -In our dbt projects, we place these base models in a nested `base` subdirectory. - -``` - ├── dbt_project.yml - └── models - ├── marts - └── staging - └── braintree - ├── base - | ├── base.yml - | ├── base_braintree__failed_payments.sql - | └── base_braintree__successful_payments.sql - ├── src_braintree.yml - ├── stg_braintree.yml - ├── stg_braintree__customers.sql - └── stg_braintree__payments.sql -``` - -In our projects, base models: - -* Often use the ephemeral materialization, so they are not exposed to end users querying our warehouse. -* Are tested in a `base.yml` file within the same directory as the base models. - -If we need additional transformations between base and staging models, we create a nested `staging//intermediate` directory and place these transformations in there. 
- -## Describing a business through `marts` -------------------------------------------------------------------------------- - -Marts are stores of models that describe business entities and processes. They are often grouped by business unit: marketing, finance, product. Models that are shared across an entire business are grouped in a core directory. - -``` - ├── dbt_project.yml - └── models - ├── marts - | ├── core - | ├── finance - | ├── marketing - | └── product - └── staging -``` - -There are entire books written on how to design models, which is beyond the scope of this article. In our view of the world, our goal is to build fact and dimension models, that are abstracted from the source data that they rely upon: - -* `fct_`**:** A tall, narrow table representing real-world processes that have occurred or are occurring. The heart of these models is usually an immutable event stream: sessions, transactions, orders, stories, votes. -* `dim_`: A wide, short table where each row is a person, place, or thing; the ultimate source of truth when identifying and describing entities of the organization. They are mutable, though slowly changing: customers, products, candidates, buildings, employees. - -Where the work of staging models is limited to cleaning and preparing, fact tables are the product of substantive data transformation: choosing (and reducing) dimensions, date-spining, executing business logic, and making informed, confident decisions. - -This layer of modeling is considerably more complex than creating staging models, and the models we _design_ are highly tailored to the analytical needs of an organization. As such, we have far less convention when it comes to these models. Some patterns we’ve found to be useful are: - -* `fct_` and `dim_` models should be materialized as tables within a warehouse to improve query performance. As a default, we use the table materialization, and where performance requires it, we use the incremental materialization. 
-* Intermediate transformations required to get to a fact or dimension model are placed in a nested `marts//intermediate` directory. They are named `__.sql`. The lack of prefix and use of double underscores indicates that these are intermediate models, not to be trusted, however, it may also be worth hiding these in a different [schema](/docs/building-a-dbt-project/building-models/using-custom-schemas). -* Models are tested and documented in a `.yml` file in the same directory as the models. -* Any extra documentation in a [docs block](/docs/building-a-dbt-project/documentation#using-docs-blocks) is placed in a `.md` file in the same directory. - -A marts directory may therefore end up looking like: - -``` - ├── dbt_project.yml - └── models - ├── marts - │ ├── core - │ │ ├── core.md - │ │ ├── core.yml - │ │ ├── dim_customers.sql - │ │ ├── fct_orders.sql - │ │ └── intermediate - │ │ ├── customer_orders__grouped.sql - │ │ ├── customer_payments__grouped.sql - │ │ ├── intermediate.yml - │ │ └── order_payments__joined.sql - │ ├── finance - │ ├── marketing - │ └── product - └── staging -``` - -This entire project results in the following DAG: - -![12_AM](/img/blog/dbt-project-structure-a5567e3711a30c72bbf9c117548452fac476d8c6_2_690x160.jpeg) - -## Accessories to data -------------------------------------------------- - -There are other kinds of SQL files that find their way into robust dbt projects. In addition to `staging` and `marts`, we find ourselves with model directories such as: - -* `utils`: An `all_days` table. This is useful everywhere, though it never forms the basis for analysis/reporting. -* `lookups`**:** A user-mapping table, a zipcode-country table, etc. These are as likely to be [CSV seeds](/docs/building-a-dbt-project/seeds) as tables in a production database. You may reference it at several unpredictable points throughout modeling, and maybe even in a BI tool. 
-* `admin`**:** Audit logs, warehouse operations, Redshift maintenance, and incremental records of the miscellaneous you run to make your project run smoothly. -* `metrics`**:** Precisely defined measurements taken from fact tables, directly conducive to time-series reporting, and tightly structured so as to allow one-to-one comparison with goals and forecasting. A metrics table lives downstream of dimension and fact tables in your DAG, and it deserves special status. -* **Packages:** While not a model folder within your main project, packages that include models (like our [snowplow](https://github.com/dbt-labs/snowplow) package) can be configured into custom schema and materialization patterns from `dbt_project.yml`. - -In projects where we find ourselves with these additional models, we often leverage [custom schemas](/docs/building-a-dbt-project/building-models/using-custom-schemas) as directories in our warehouse, to logically group the models, choosing a schema name that matches the directory name in our dbt project. - -## Final thoughts ------------------------------------ - -In this article, building the DAG for a dbt project has been described left to right, starting at sources, and ending with marts models. - -However, it’s worth noting that in reality we often first think through a modeling problem from right to left — we start with an idea of the dashboard or report we want to build, then whiteboard the structure of the marts model we need in our warehouse to power this dashboard. On the same whiteboard, we’ll often then work backwards until we reach our source, before we start writing any actual SQL. I’ve found that it’s only once I’ve solved a modeling problem a few times that I get an intuition for how to build a DAG from left to right. In other words: we tend to think about our destination before we start our modeling journey. 
- -* * * - -¹We’ve standardized our naming and type conventions in our [dbt coding conventions](https://github.com/dbt-labs/corp/blob/master/dbt_coding_conventions.md). diff --git a/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md b/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md index 944d6fdd3f9..cdfd4da5f5d 100644 --- a/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md +++ b/website/blog/2020-07-01-how-to-create-near-real-time-models-with-just-dbt-sql.md @@ -13,6 +13,13 @@ date: 2020-07-01 is_featured: false --- +:::caution More up-to-date information available + +Since this blog post was first published, many data platforms have added support for [materialized views](/blog/announcing-materialized-views), which are a superior way to achieve the goals outlined here. We recommend them over the below approach. + + +::: + Before I dive into how to create this, I have to say this. **You probably don’t need this**. I, along with my other Fishtown colleagues, have spent countless hours working with clients that ask for near-real-time streaming data. However, when we start digging into the project, it is often realized that the use case is not there. There are a variety of reasons why near real-time streaming is not a good fit. Two key ones are: 1. The source data isn’t updating frequently enough. 
diff --git a/website/blog/2021-02-05-dbt-project-checklist.md b/website/blog/2021-02-05-dbt-project-checklist.md index 6100e7ff089..dbe2c10f408 100644 --- a/website/blog/2021-02-05-dbt-project-checklist.md +++ b/website/blog/2021-02-05-dbt-project-checklist.md @@ -53,8 +53,8 @@ This post is the checklist I created to guide our internal work, and I’m shari **Useful links**: -* [.yml files](/docs/building-a-dbt-project/using-sources/#testing-and-documenting-sources) -* [Materializations](/docs/building-a-dbt-project/building-models/materializations/#configuring-materializations) +* [.yml files](/docs/build/sources#testing-and-documenting-sources) +* [Materializations](/docs/build/materializations/#configuring-materializations) * [YAML selectors](/reference/node-selection/yaml-selectors/) ## ✅ Package Management @@ -67,7 +67,7 @@ This post is the checklist I created to guide our internal work, and I’m shari **Useful links** -* [Packages Docs](/docs/building-a-dbt-project/package-management/) +* [Packages Docs](/docs/build/packages/) * [Package Hub](https://hub.getdbt.com/) * [dbt utils package](https://github.com/dbt-labs/dbt-utils) @@ -136,10 +136,10 @@ This post is the checklist I created to guide our internal work, and I’m shari **Useful links** * [dbt release version](https://github.com/dbt-labs/dbt/releases) -* [Sources](/docs/building-a-dbt-project/using-sources/) +* [Sources](/docs/build/sources/) * [Refs](/reference/dbt-jinja-functions/ref/) * [tags](/reference/resource-configs/tags/) -* [Jinja docs](/docs/building-a-dbt-project/jinja-macros) +* [Jinja docs](/guides/advanced/using-jinja) ## ✅ Testing & Continuous Integration ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- @@ -173,8 +173,8 @@ This post is the checklist I created to guide our internal work, and I’m shari Useful Links -* [FAQs for 
documentation](/docs/building-a-dbt-project/documentation/#faqs) -* [Doc blocks](/docs/building-a-dbt-project/documentation/#using-docs-blocks) +* [FAQs for documentation](/docs/collaborate/documentation#faqs) +* [Doc blocks](/docs/collaborate/documentation#using-docs-blocks) ## ✅ dbt Cloud specifics ---------------------------------------------------------------------------------------------------------------------------------------------------------- @@ -200,7 +200,7 @@ Are you using the IDE and if so, how well? **Useful links** -* [dbt Cloud as a CI tool](/docs/deploy/cloud-ci-job) +* [dbt Cloud as a CI tool](/docs/deploy/continuous-integration) ## ✅ DAG Auditing @@ -252,9 +252,9 @@ Thanks to Christine Berger for her DAG diagrams! **Useful links** -* [How Fishtown Structures our dbt Project](/blog/how-we-structure-our-dbt-projects/) +* [How we structure our dbt Project](/guides/best-practices/how-we-structure/1-guide-overview) * [Coalesce DAG Audit Talk](https://www.youtube.com/watch?v=5W6VrnHVkCA&t=2s) * [Modular Data Modeling Technique](https://getdbt.com/analytics-engineering/modular-data-modeling-technique/) -* [Understanding Threads](/dbt-cli/configure-your-profile/#understanding-threads) +* [Understanding Threads](/docs/running-a-dbt-project/using-threads) This is a quick overview of things to think about in your project. We’ll keep this post updated as we continue to refine our best practices! Happy modeling! diff --git a/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md b/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md index 60dc91278d2..50d09625436 100644 --- a/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md +++ b/website/blog/2021-02-09-how-to-configure-your-dbt-repository-one-or-many.md @@ -60,7 +60,7 @@ This is the most common structure we see for dbt repository configuration. 
Thoug **Strengths** * Easy to share and maintain the same core business logic -* Full dependency lineage - your dbt generated DAG encompasses all of your data transformations for your entire company +* Full dependency lineage - your dbt generated DAG encompasses all of your [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) for your entire company **Weaknesses** @@ -80,7 +80,7 @@ This is our most time tested option and our most recommended. However, we have s ![separate repository](/img/blog/monorepo-3d6f91c1ab275d953417d2239f66e8f81bad7078_2_600x217.png) -This is one of the first structures we see people move toward when they “outgrow” the mono repo: there is one “core” repository that is incorporated into team specific repositories as a package. If you aren’t familiar with packages, [see the documentation](https://docs.getdbt.com/docs/building-a-dbt-project/package-management/) for more information. +This is one of the first structures we see people move toward when they “outgrow” the mono repo: there is one “core” repository that is incorporated into team specific repositories as a package. If you aren’t familiar with packages, [see the documentation](https://docs.getdbt.com/docs/build/packages/) for more information. How would the above function? While each team would work in their own repository, they would put shared items into the shared repository which is then installed in as a package to their repository. Some common things to put into that shared repository would be: @@ -106,7 +106,7 @@ What doesn’t go into that shared repository? * Maintaining downstream dependencies of macros and models. There is a need to create a CI/CD process that assures changes in the shared repository will not negatively impact the downstream repositories. It’s possible that you will have to introduce [semantic versioning](https://en.wikipedia.org/wiki/Software_versioning) to mitigate miscommunication about breaking changes. 
* Incomplete lineage/documentation for objects not the shared repository -This is the option I recommend the most when one must stray away from Option 2. This follows our [dbt viewpoint](https://docs.getdbt.com/docs/about/viewpoint/#analytics-is-collaborative) the best in terms of dry code and collaboration as opposed to Option 3 & 4. +This is the option I recommend the most when one must stray away from Option 2. This follows our [dbt viewpoint](/community/resources/viewpoint#analytics-is-collaborative) the best in terms of dry code and collaboration as opposed to Option 3 & 4. ## Option 3: Completely Separate Repositories ------------------------------------------------------------------------------------------ @@ -159,4 +159,4 @@ All of the above configurations “work”. And as detailed, they each solve for 2. Figure out what may be a pain point in the future and try to plan for it from the beginning. 3. Don’t over-complicate things until you have the right reason. As I said in my Coalesce talk: **don’t drag your skeletons from one closet to another** 💀! -**Note:** Our attempt in writing guides like this and [How we structure our dbt projects](/blog/how-we-structure-our-dbt-projects) aren’t to try to convince you that our way is right; it is to hopefully save you the hundreds of hours it has taken us to form those opinions! +**Note:** Our attempt in writing guides like this and [How we structure our dbt projects](/guides/best-practices/how-we-structure/1-guide-overview) aren’t to try to convince you that our way is right; it is to hopefully save you the hundreds of hours it has taken us to form those opinions! diff --git a/website/blog/2021-09-11-sql-dateadd.md b/website/blog/2021-09-11-sql-dateadd.md index ad6ae7d7b08..23e5a29dcb9 100644 --- a/website/blog/2021-09-11-sql-dateadd.md +++ b/website/blog/2021-09-11-sql-dateadd.md @@ -2,6 +2,7 @@ title: "DATEADD SQL Function Across Data Warehouses" description: "DATEADD Function syntax varies across data warehouses. 
Learn how to standardize your syntax no matter the container." slug: sql-dateadd +canonical_url: https://docs.getdbt.com/sql-reference/dateadd authors: david_krevitt diff --git a/website/blog/2021-09-11-union-all.md b/website/blog/2021-09-11-union-all.md index b85645e7da7..036e46a0abd 100644 --- a/website/blog/2021-09-11-union-all.md +++ b/website/blog/2021-09-11-union-all.md @@ -76,7 +76,7 @@ As you can imagine, this gets super painful when we want to add a new column wit ## Enter the union_relations dbt macro -dbt allows you to write [macro functions in Jinja](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros) to automate away this chore. +dbt allows you to write [macro functions in Jinja](/docs/build/jinja-macros) to automate away this chore. The [union_relations](https://github.com/dbt-labs/dbt-utils#union_relations-source) macro in the [dbt_utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) package completely frees us from propagating null or 0 values for each column that doesn’t exist in the other tables that we’re UNIONing. diff --git a/website/blog/2021-09-15-september-21-product-email.md b/website/blog/2021-09-15-september-21-product-email.md index 9d8f6b76606..a3c9993befa 100644 --- a/website/blog/2021-09-15-september-21-product-email.md +++ b/website/blog/2021-09-15-september-21-product-email.md @@ -4,7 +4,6 @@ description: "dbt v1.0 is coming up! 
Don't forget to update your projects to the slug: dbt-product-update-2021-september authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-09-15 @@ -34,7 +33,7 @@ Give Jeremy a win and check out the [blog](http://blog.getdbt.com/getting-ready ### dbt v0.21.0-rc1 - Check out the [#dbt-prereleases](https://getdbt.slack.com/archives/C016X6ABVUK?utm_campaign=Monthly%20Product%20Updates&utm_source=hs_email&utm_medium=email&_hsenc=p2ANqtz-8nIpohDBSr7SvpXrqY-5ONmnjdIgW0XMiAPkjQTb9Pgwt24nzqAWNX2Xgtj8LA0LrPoHpD) channel in the dbt Community Slack, and Jeremy's [Discourse post](https://discourse.getdbt.com/t/prerelease-dbt-core-v0-21-louis-kahn/3077?utm_campaign=Monthly%20Product%20Updates&utm_source=hs_email&utm_medium=email&_hsenc=p2ANqtz-8nIpohDBSr7SvpXrqY-5ONmnjdIgW0XMiAPkjQTb9Pgwt24nzqAWNX2Xgtj8LA0LrPoHpD)! - dbt build: Did you catch our teaser last month at [Staging](https://www.youtube.com/watch?v=-XRD_IjWX2U&utm_campaign=Monthly%20Product%20Updates&utm_source=hs_email&utm_medium=email&_hsenc=p2ANqtz-8nIpohDBSr7SvpXrqY-5ONmnjdIgW0XMiAPkjQTb9Pgwt24nzqAWNX2Xgtj8LA0LrPoHpD)? -- Defining resource configs in all the places you'd expect (i.e. yaml files) +- Defining resource configs in all the places you'd expect (i.e. 
YAML files) - Capture changes to macros in state:modified, for best-yet Slim CI ![Screen Shot 2021-09-20 at 11.34.47 AM (1)](https://hs-8698602.f.hubspotemail.net/hub/8698602/hubfs/Screen%20Shot%202021-09-20%20at%2011.34.47%20AM%20(1).png?upscale=true&width=600&upscale=true&name=Screen%20Shot%202021-09-20%20at%2011.34.47%20AM%20(1).png) diff --git a/website/blog/2021-10-15-october-21-product-update-email.md b/website/blog/2021-10-15-october-21-product-update-email.md index 2affc5a837e..c235e43bf43 100644 --- a/website/blog/2021-10-15-october-21-product-update-email.md +++ b/website/blog/2021-10-15-october-21-product-update-email.md @@ -1,10 +1,9 @@ --- title: "October 2021 dbt Update: Metrics and Hat Tricks 🎩" -description: "Also flagging that Coalesce is less than 3 weeks away! 😱" +description: "Stay up-to-date with the latest features in dbt. Read about our October 2021 product update." slug: dbt-product-update-2021-october authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-10-15 diff --git a/website/blog/2021-11-15-november-21-product-email.md b/website/blog/2021-11-15-november-21-product-email.md index 3c28187f739..dd5d2b63956 100644 --- a/website/blog/2021-11-15-november-21-product-email.md +++ b/website/blog/2021-11-15-november-21-product-email.md @@ -1,10 +1,9 @@ --- title: "November 2021 dbt Update: v1.0, Environment Variables, and a Question About the Size of Waves 🌊" -description: "Also flagging that Coalesce is less than 3 weeks away! 😱" +description: "Stay up-to-date with the latest features in dbt. Read about our November 2021 product update." 
slug: dbt-product-update-2021-november authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2021-11-15 @@ -60,7 +59,7 @@ Hear their take, and share your own by [registering here](https://coalesce.getdb ### Things to Listen To 🎧 -- Julien Le Dem joined the [Analytics Engineer Podcast](https://roundup.getdbt.com/p/ep-10-why-data-lineage-matters-w?utm_campaign=Monthly%20Product%20Updates&utm_source=hs_email&utm_medium=email&_hsenc=p2ANqtz-9SoWbfj9_ZRDew6i8p8yand1JSmLh7yfridIrLwO7bgHTUmnbKcRp3AEKCO8pOytotdxAo) to talk about how OS projects become standards, and why data lineage in particular is in need of an open standard.  +- Julien Le Dem joined the [Analytics Engineer Podcast](https://roundup.getdbt.com/p/ep-10-why-data-lineage-matters-w?utm_campaign=Monthly%20Product%20Updates&utm_source=hs_email&utm_medium=email&_hsenc=p2ANqtz-9SoWbfj9_ZRDew6i8p8yand1JSmLh7yfridIrLwO7bgHTUmnbKcRp3AEKCO8pOytotdxAo) to talk about how OS projects become standards, and why data lineage in particular is in need of an open standard.  - [The rise of the Analytics Engineer](https://youtu.be/ixyzF4Dy9Us?utm_campaign=Monthly%20Product%20Updates&utm_source=hs_email&utm_medium=email&_hsenc=p2ANqtz-9SoWbfj9_ZRDew6i8p8yand1JSmLh7yfridIrLwO7bgHTUmnbKcRp3AEKCO8pOytotdxAo): Anna, dbt Labs Director of Community, joined Thoughtspot to talk about the evolution of analytics engineering, or the emergence of the "full stack data analyst." diff --git a/website/blog/2021-11-22-dbt-labs-pr-template.md b/website/blog/2021-11-22-dbt-labs-pr-template.md index 4460b27d494..40d4960ac18 100644 --- a/website/blog/2021-11-22-dbt-labs-pr-template.md +++ b/website/blog/2021-11-22-dbt-labs-pr-template.md @@ -80,7 +80,7 @@ By including a screenshot of your dbt test run here, you are confirming that you #### Changes to existing models: -This is a place to leave post-merge instructions.
Maybe you updated your existing [incremental model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models) with an additional column and need to run a [full refresh](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models#how-do-i-rebuild-an-incremental-model). +This is a place to leave post-merge instructions. Maybe you updated your existing [incremental model](https://docs.getdbt.com/docs/build/incremental-models) with an additional column and need to run a [full refresh](https://docs.getdbt.com/docs/build/incremental-models#how-do-i-rebuild-an-incremental-model). Or, maybe you have a corresponding PR for your BI tool that needs to be merged to accommodate your dbt modeling changes. @@ -111,7 +111,7 @@ By default, all new models should have _at least_ unique and not null tests on t Documentation follows the same reasoning as the PR description. You will know more **right now** about the intricacies of these models than you will after you’ve developed 50 more models in the coming months. -**I have [materialized my models](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/materializations) appropriately.** +**I have [materialized my models](https://docs.getdbt.com/docs/build/materializations) appropriately.** This is all about performance. Our ultimate goal is to model data such that end users can easily and efficiently query the resulting database objects. 
diff --git a/website/blog/2021-11-22-sql-surrogate-keys.md b/website/blog/2021-11-22-sql-surrogate-keys.md index 05422ba3506..75e6d591f24 100644 --- a/website/blog/2021-11-22-sql-surrogate-keys.md +++ b/website/blog/2021-11-22-sql-surrogate-keys.md @@ -24,7 +24,7 @@ We all know one of the most fundamental rules in data is that every data warehouses, and how you can use one simple dbt macro ([dbt_utils.surrogate_key](https://github.com/dbt-labs/dbt-utils#surrogate_key-source)) to abstract away the null value problem. +Let’s take a look at how generating surrogate keys specifically looks in practice across data warehouses, and how you can use one simple dbt macro ([dbt_utils.generate_surrogate_key](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)) to abstract away the null value problem. ### A surrogate_key macro to the rescue -Thanks to a handy function called [surrogate_key](https://github.com/dbt-labs/dbt-utils#surrogate_key-source) in the [dbt_utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/), you can fire yourself from the business of wrapping your columns in `coalesce` every time you want to generate a surrogate key. +Thanks to a handy function called [generate_surrogate_key](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source) in the [dbt_utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/), you can fire yourself from the business of wrapping your columns in `coalesce` every time you want to generate a surrogate key. Forming your surrogate keys with this macro has the benefit of **elegant + null handling**. Rather than wrapping your columns in a `coalesce` function when concatenating them, the macro loops through your columns and _coalesces_ on your behalf, so that you can avoid repeating yourself.
-When you call `{{ dbt_utils.surrogate_key(['field_a', 'field_b'[,...]]) }}`, behind the scenes dbt compiles SQL on your behalf, looping through each field and generating the correct number of `coalesce` statements with type casting: +When you call `{{ dbt_utils.generate_surrogate_key(['field_a', 'field_b'[,...]]) }}`, behind the scenes dbt compiles SQL on your behalf, looping through each field and generating the correct number of `coalesce` statements with type casting: ```sql - coalesce(cast(" ~ field ~ " as " ~ dbt_utils.type_string() ~ "), '') + coalesce(cast(" ~ field ~ " as " ~ dbt.type_string() ~ "), '_dbt_utils_surrogate_key_null_') ``` and with conditional logic, adding separator between fields: diff --git a/website/blog/2021-11-26-welcome-to-the-dbt-developer-blog.md b/website/blog/2021-11-26-welcome-to-the-dbt-developer-blog.md index 3037002955b..c6fff54b465 100644 --- a/website/blog/2021-11-26-welcome-to-the-dbt-developer-blog.md +++ b/website/blog/2021-11-26-welcome-to-the-dbt-developer-blog.md @@ -26,7 +26,7 @@ So let’s all commit to sharing our hard won knowledge with each other—and in The purpose of this blog is to double down on our long running commitment to contributing to the knowledge loop. -From early posts like ‘[The Startup Founders Guide to Analytics’](https://thinkgrowth.org/the-startup-founders-guide-to-analytics-1d2176f20ac1) to foundational guides like [‘How We Structure Our dbt Projects](/blog/how-we-structure-our-dbt-projects)’, we’ve had a long standing goal of working with the community to create practical, hands-on tutorials and guides which distill the knowledge we’ve been able to collectively gather. 
+From early posts like ‘[The Startup Founders Guide to Analytics’](https://thinkgrowth.org/the-startup-founders-guide-to-analytics-1d2176f20ac1) to foundational guides like [‘How We Structure Our dbt Projects](/guides/best-practices/how-we-structure/1-guide-overview)’, we’ve had a long standing goal of working with the community to create practical, hands-on tutorials and guides which distill the knowledge we’ve been able to collectively gather. dbt as a product is based around the philosophy that even the most complicated problems can be broken down into modular, reusable components, then mixed and matched to create something novel. diff --git a/website/blog/2021-11-29-dbt-airflow-spiritual-alignment.md b/website/blog/2021-11-29-dbt-airflow-spiritual-alignment.md index 9edcb84fd4f..0a2ec874a22 100644 --- a/website/blog/2021-11-29-dbt-airflow-spiritual-alignment.md +++ b/website/blog/2021-11-29-dbt-airflow-spiritual-alignment.md @@ -55,7 +55,7 @@ Airflow solves those same problems, but in a publicly-verifiable and trusted way ### From the dbt side -That pipeline above included a plethora of data transformation jobs, built in various ways. +That pipeline above included a plethora of [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) jobs, built in various ways. They were often written in naked python scripts that only ran a SQL query + wrote data to BigQuery. 
These stored procedure-like SQL scripts required: @@ -91,7 +91,7 @@ The common skills needed for implementing any flavor of dbt (Core or Cloud) are: * SQL: ‘nuff said * YAML: required to generate config files for [writing tests on data models](/docs/build/tests) -* [Jinja](/guides/getting-started/learning-more/using-jinja): allows you to write DRY code (using [macros](/docs/building-a-dbt-project/jinja-macros), for loops, if statements, etc) +* [Jinja](/guides/advanced/using-jinja): allows you to write DRY code (using [macros](/docs/build/jinja-macros), for loops, if statements, etc) YAML + Jinja can be learned pretty quickly, but SQL is the non-negotiable you’ll need to get started. @@ -103,7 +103,7 @@ To layer on Airflow, you’ll need more software or infrastructure engineering-y Knowing that this toolbelt (Airflow + dbt) provides sustenance to the same spiritual needs (public observability, configuration as code, version control etc), how might one decide when and where to deploy them? -> This is the same sensibility expressed in the [dbt viewpoint](/docs/about/viewpoint) in 2016, the closest thing to a founding blog post as exists for dbt. ] +> This is the same sensibility expressed in the [dbt viewpoint](/community/resources/viewpoint) in 2016, the closest thing to a founding blog post as exists for dbt. ] I usually think in terms of how I want my job to look when things go wrong—am I equipped to do the debugging, and is it clear who to pass the baton to, to fix the issue (if it’s not me)? @@ -111,9 +111,9 @@ A couple examples: ### Pipeline observability for analysts -If your team’s dbt users are analysts rather than engineers, they still may need to be able to dig into the root cause of a failing dbt [source freshness test](/docs/dbt-cloud/using-dbt-cloud/cloud-snapshotting-source-freshness). 
+If your team’s dbt users are analysts rather than engineers, they still may need to be able to dig into the root cause of a failing dbt [source freshness test](/docs/build/sources). -Having your upstream extract + load jobs configured in Airflow means that analysts can pop open the Airflow UI to monitor for issues (as they would a GUI-based [ETL tool](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/)), rather than opening a ticket or bugging an engineer in Slack. The Airflow UI provides the common interface that analysts need to self-serve, up to the point of action needing to be taken. +Having your upstream extract + load jobs configured in Airflow means that analysts can pop open the Airflow UI to monitor for issues (as they would a GUI-based ETL tool), rather than opening a ticket or bugging an engineer in Slack. The Airflow UI provides the common interface that analysts need to self-serve, up to the point of action needing to be taken. ![airflow dashboard](/img/blog/airflow-dbt-dashboard.png "airflow dashboard") @@ -121,7 +121,7 @@ Having your upstream extract + load jobs configured in Airflow means that analys When a dbt run fails within an Airflow pipeline, an engineer monitoring the overall pipeline will likely not have the business context to understand why the individual model or test failed—they were probably not the one who built it. -dbt provides common programmatic interfaces (the [dbt Cloud Admin + Metadata APIs](/docs/dbt-cloud/dbt-cloud-api/cloud-apis), and [.json-based artifacts](/reference/artifacts/dbt-artifacts) in the case of dbt Core) that provide the context needed for the engineer to self-serve—either by rerunning from a point of failure or reaching out to the owner. 
+dbt provides common programmatic interfaces (the [dbt Cloud Admin + Metadata APIs](/docs/dbt-cloud-apis/overview), and [.json-based artifacts](/reference/artifacts/dbt-artifacts) in the case of dbt Core) that provide the context needed for the engineer to self-serve—either by rerunning from a point of failure or reaching out to the owner. ## Why I ❤️ dbt Cloud + Airflow diff --git a/website/blog/2021-11-29-open-source-community-growth.md b/website/blog/2021-11-29-open-source-community-growth.md index a61fa7ac46b..8a71a504875 100644 --- a/website/blog/2021-11-29-open-source-community-growth.md +++ b/website/blog/2021-11-29-open-source-community-growth.md @@ -46,7 +46,7 @@ Here are the tools I chose to use: - dbt seeds data from offline sources and performs necessary transformations on data after it's been loaded into BigQuery. -- OpenLineage collects data lineage and performance metadata as models run, so I can identify issues and find bottlenecks. Also, to be the subject ecosystem for this study :) +- OpenLineage collects data lineage and performance metadata as models run, so I can identify issues and find bottlenecks. Also, to be the subject ecosystem for this study :) - Superset visualizes and analyzes results, creates dashboards, and helps me communicate with stakeholders. diff --git a/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md b/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md index 7149f5a49b8..c4de04a48c3 100644 --- a/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md +++ b/website/blog/2021-12-05-how-to-build-a-mature-dbt-project-from-scratch.md @@ -21,7 +21,7 @@ One question we hear time and time again is this - what does it look like to pro When Will posed this question on Slack, it got me thinking about what it would take to create a framework for dbt project maturity.
-As an analytics engineer on the professional services team at dbt Labs, my teammates and I have had the unique opportunity to work on an unusually high number dbt projects at organizations ranging from tiny startups to Fortune 500 companies and everything in between. From this vantage point, we have gained a unique understanding of the dbt adoption curve - how companies actually implement and expand their usage of dbt. +As an analytics engineer on the professional services team at dbt Labs, my teammates and I have had the unique opportunity to work on an unusually high number of dbt projects at organizations ranging from tiny startups to Fortune 500 companies and everything in between. From this vantage point, we have gained a unique understanding of the dbt adoption curve - how companies actually implement and expand their usage of dbt. With every new engagement, we find ourselves working in a project with a unique mix of data challenges. With the explosion in popularity of dbt, and the constant release of new features and capabilities available in the tool, it’s really easy for data teams to go down the rabbit hole of dbt’s shiniest new features before prioritizing the simple ones that will likely be the most immediately impactful to their organization. @@ -53,7 +53,7 @@ Let’s pretend that we are an analytics engineer at Seeq Wellness, a hypothetic **Key Outcomes** -* Create your first [model](/docs/building-a-dbt-project/building-models) +* Create your first [model](/docs/build/sql-models) * Execute your first [dbt run](/reference/commands/run) @@ -67,9 +67,9 @@ The goal here is to learn the very basics of interacting with a dbt project; fee In addition to learning the basic pieces of dbt, we're familiarizing ourselves with the modern, version-controlled analytics engineering workflow, and experimenting with how it feels to use it at our organization. -If we decide not to do this, we end up missing out on what the dbt workflow has to offer. 
If you want to learn more about why we think analytics engineering with dbt is the way to go, I encourage you to read the [dbt Viewpoint](/docs/about/viewpoint)! +If we decide not to do this, we end up missing out on what the dbt workflow has to offer. If you want to learn more about why we think analytics engineering with dbt is the way to go, I encourage you to read the [dbt Viewpoint](/community/resources/viewpoint#analytics-is-collaborative)! -In order to learn the basics, we’re going to [port over the SQL file](/docs/get-started/learning-more/refactoring-legacy-sql) that powers our existing "patient_claim_summary" report that we use in our KPI dashboard in parallel to our old transformation process. We’re not ripping out the old plumbing just yet. In doing so, we're going to try dbt on for size and get used to interfacing with a dbt project. +In order to learn the basics, we’re going to [port over the SQL file](/guides/migration/tools/refactoring-legacy-sql) that powers our existing "patient_claim_summary" report that we use in our KPI dashboard in parallel to our old transformation process. We’re not ripping out the old plumbing just yet. In doing so, we're going to try dbt on for size and get used to interfacing with a dbt project. 
**Project Appearance** @@ -83,11 +83,11 @@ The most important thing we’re introducing when your project is an infant is t **Key Outcomes** -* Configure your first [sources](/docs/building-a-dbt-project/using-sources) +* Configure your first [sources](/docs/build/sources) * Introduce modularity with [{{ ref() }}](/reference/dbt-jinja-functions/ref) and [{{ source() }}](/reference/dbt-jinja-functions/source) -* [Document](/docs/building-a-dbt-project/documentation) and [test](/docs/build/tests) your first models +* [Document](/docs/collaborate/documentation) and [test](/docs/build/tests) your first models ![image alt text](/img/blog/building-a-mature-dbt-project-from-scratch/image_3.png) @@ -107,7 +107,7 @@ We’re going to: **Project Appearance** -Let's check in on the growth of [our projec](https://github.com/dbt-labs/dbt-project-maturity/tree/main/2-toddlerhood)t. We've broken some of our logic into its own model — our original script had repetitive logic in subqueries, now it's following a key principle of analytics engineering: Don't Repeat Yourself (DRY). For more information on how to refactor your SQL queries for Modularity - check out our [free on-demand course](https://courses.getdbt.com/courses/refactoring-sql-for-modularity). +Let's check in on the growth of [our project](https://github.com/dbt-labs/dbt-project-maturity/tree/main/2-toddlerhood). We've broken some of our logic into its own model — our original script had repetitive logic in subqueries, now it's following a key principle of analytics engineering: Don't Repeat Yourself (DRY). For more information on how to refactor your SQL queries for Modularity - check out our [free on-demand course](https://courses.getdbt.com/courses/refactoring-sql-for-modularity). We also added our first [YML files](https://circleci.com/blog/what-is-yaml-a-beginner-s-guide/). 
Here, we have one yml file to [configure our sources](https://github.com/dbt-labs/dbt-project-maturity/blob/main/2-toddlerhood/models/source.yml), and one one yml file to [describe our models](https://github.com/dbt-labs/dbt-project-maturity/blob/main/2-toddlerhood/models/schema.yml). We're just starting with basic declarations of our sources, testing using dbt built in tests, and a model-level description -- these are the first steps of a project just learning to walk! @@ -125,13 +125,15 @@ Leveling up from infant to toddler is a huge jump in terms of feature completene * Create a PR template to ensure quality and consistency -* [Deploy your project](/docs/running-a-dbt-project/running-dbt-in-production)! +* [Deploy your project](/docs/deploy/deployments)! ![image alt text](/img/blog/building-a-mature-dbt-project-from-scratch/image_5.png) **Themes and Goals** -We made a huge jump in our feature completeness in the last stage - now it’s time to think about getting the project ready to be used by multiple developers and even deployed into production. The best way to ensure consistency as we start collaborating is to define standards for how we write code and model data then enforce them in the review process. From the data team's perspective, we shouldn't be able to infer who wrote what line of code because one of our teammates uses the dreaded leading comma. Analytics code is an asset, and should be treated as production grade software. Project Appearance +We made a huge jump in our feature completeness in the last stage - now it’s time to think about getting the project ready to be used by multiple developers and even deployed into production. The best way to ensure consistency as we start collaborating is to define standards for how we write code and model data then enforce them in the review process. From the data team's perspective, we shouldn't be able to infer who wrote what line of code because one of our teammates uses the dreaded leading comma. 
Analytics code is an asset, and should be treated as production grade software. + +**Project Appearance** We've added project-level documentation to [our repo](https://github.com/dbt-labs/dbt-project-maturity/tree/main/3-childhood) for developers to review as they get started in this project. This generally includes: @@ -141,7 +143,7 @@ We've added project-level documentation to [our repo](https://github.com/dbt-lab 3. A [pull request template](https://github.com/dbt-labs/dbt-project-maturity/blob/main/3-childhood/.github/pull_request_template.md) to make sure we're checking new code against these guidelines every time we want to add new modeling work! -Let's look at our models — we went from a eary stage DAG, starting to get a feel for modularity, to a clean, standardized and logically organized DAG — we can now see logical layers of modeling that correspond the file tree structure we saw before — we can even see the model naming conventions lining up with these layers (stg, int, fct). Defining the standards in how we organize our models in our project level has resulted in a cleaner, easier to understand DAG too! +Let's look at our models — we went from an early-stage DAG, starting to get a feel for modularity, to a clean, standardized and logically organized DAG — we can now see logical layers of modeling that correspond to the file tree structure we saw before — we can even see the model naming conventions lining up with these layers (stg, int, fct). Defining the standards in how we organize our models in our project level has resulted in a cleaner, easier to understand DAG too!
![image alt text](/img/blog/building-a-mature-dbt-project-from-scratch/image_6.png) @@ -151,11 +153,11 @@ Even though we haven't changed the function of a lot of our features *codifying **Key Outcomes** -* Leverage code from dbt [packages](/docs/building-a-dbt-project/package-management) +* Leverage code from dbt [packages](/docs/build/packages) * Increase model flexibility and scope of project -* Reduce dbt production build times with [advanced materializations](/docs/building-a-dbt-project/building-models/materializations) +* Reduce dbt production build times with [advanced materializations](/docs/build/materializations) ![image alt text](/img/blog/building-a-mature-dbt-project-from-scratch/image_7.png) @@ -167,7 +169,7 @@ I want to also call out that a "feature" to introduce at this stage is engagemen **Project Appearance** -We can see the major development at [this stage](https://github.com/dbt-labs/dbt-project-maturity/tree/main/4-adolescence) is adding additional models that make our original claims report a lot more flexible -- we had only shown our users a subset of patient and doctor information in our fact model. Now, we have a more Kimball-ish-style marts set up, and we can leave selecting the dimensions up to our BI tool. +We can see the major development at [this stage](https://github.com/dbt-labs/dbt-project-maturity/tree/main/4-adolescence) is adding additional models that make our original claims report a lot more flexible -- we had only shown our users a subset of patient and doctor information in our fact model. Now, we have a more Kimball-ish-style marts setup, and we can leave selecting the dimensions up to our BI tool. 
![image alt text](/img/blog/building-a-mature-dbt-project-from-scratch/image_8.png) @@ -183,7 +185,9 @@ We've spent this level focused on deepening and optimizing our feature set — w * Advanced use of metadata -![image alt text](/img/blog/building-a-mature-dbt-project-from-scratch/image_9.png)Themes and Goals +![image alt text](/img/blog/building-a-mature-dbt-project-from-scratch/image_9.png) + +**Themes and Goals** In adulthood, we're turning our gaze even further inward. Our dbt project itself is independent enough to start asking itself the big questions! What does it mean to be a dbt project in the year 2021? How have I been changing? How am I relating to my peers? diff --git a/website/blog/2022-01-12-time-on-task-calculation.md b/website/blog/2022-01-12-time-on-task-calculation.md index d4b3dac0666..8b2c4867244 100644 --- a/website/blog/2022-01-12-time-on-task-calculation.md +++ b/website/blog/2022-01-12-time-on-task-calculation.md @@ -35,7 +35,7 @@ This piece will provide an overview of how and critically *why* to calculate Tim 1. One size fits all with nested macros -This solution allowed us to create a one-line [dbt macro](/docs/building-a-dbt-project/jinja-macros#macros) to account for most common Time On Task use cases by having a series of nested macros behind the scenes. +This solution allowed us to create a one-line [dbt macro](/docs/build/jinja-macros) to account for most common Time On Task use cases by having a series of nested macros behind the scenes. This strategy does a great job in being able to account for nights, weekends and custom holidays, but lacks the flexibility to accommodate changes in business hours, so we've transitioned off of it to the 2nd option: @@ -94,7 +94,7 @@ So for any timestamp that is already in business hours like the above example, t ### What about holidays? 
-We maintain a [seed file](/docs/building-a-dbt-project/seeds) in our project that has the dates of holidays for the next 5 years or so - we join this to our hour-level date_dim table, and incorporate holidays into the boolean column mentioned above. This way, any ticket that comes in on a holiday gets fast forwarded to the beginning of the next working day. Not a perfect solution, so curious to hear how this is handled elsewhere! +We maintain a [seed file](/docs/build/seeds) in our project that has the dates of holidays for the next 5 years or so - we join this to our hour-level date_dim table, and incorporate holidays into the boolean column mentioned above. This way, any ticket that comes in on a holiday gets fast forwarded to the beginning of the next working day. Not a perfect solution, so curious to hear how this is handled elsewhere! ## The customizable option: a bespoke calendar + subquery diff --git a/website/blog/2022-02-07-customer-360-view-census-playbook.md b/website/blog/2022-02-07-customer-360-view-census-playbook.md index b3681edc1cd..01bea4b09c5 100644 --- a/website/blog/2022-02-07-customer-360-view-census-playbook.md +++ b/website/blog/2022-02-07-customer-360-view-census-playbook.md @@ -14,7 +14,7 @@ is_featured: true *Editor's note: In this tutorial, Donny walks through the fictional story of a SaaS company called JaffleGaggle, who needs to group their freemium individual users into company accounts (aka a customer 360 view) in order to drive their product-led growth efforts.* -*You can follow along with Donny's data modeling technique for identity resolution in [this dbt project repo](https://github.com/dflynn20/jaffle_gaggle). It includes a set of demo CSV files, which you can use as [dbt seeds](https://docs.getdbt.com/docs/building-a-dbt-project/seeds) to test Donny's project for yourself.* +*You can follow along with Donny's data modeling technique for identity resolution in [this dbt project repo](https://github.com/dflynn20/jaffle_gaggle). 
It includes a set of demo CSV files, which you can use as [dbt seeds](https://docs.getdbt.com/docs/build/seeds) to test Donny's project for yourself.* @@ -30,7 +30,7 @@ In short, a jaffle is: *See above: Tasty, tasty jaffles.* -Jaffle Shop is a demo repo referenced in [dbt’s Getting Started Guide](/docs/get-started/getting-started/overview), and its jaffles hold a special place in the dbt community’s hearts, as well as on Data Twitter™. +Jaffle Shop is a demo repo referenced in [dbt’s Getting Started Guide](/quickstarts), and its jaffles hold a special place in the dbt community’s hearts, as well as on Data Twitter™. ![jaffles on data twitter](/img/blog/2022-02-08-customer-360-view/image_1.png) @@ -89,7 +89,7 @@ The data structure breaks down as follows: Let’s get rolling. -> Builder Beware! If this was an actual event stream, it would be much better to leverage [incremental models based on timestamp](/docs/building-a-dbt-project/building-models/configuring-incremental-models), but because it’s a playground project, I did not. +> Builder Beware! If this was an actual event stream, it would be much better to leverage [incremental models based on timestamp](/docs/build/incremental-models), but because it’s a playground project, I did not. ## Step 1: Define our entities @@ -141,7 +141,7 @@ For this step, take a look at a snippet from [`models/staging/stg_users.sql`](ht from source ``` -We defined the email domain extraction as a [macro](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros) called [`extract_email_domain`](https://github.com/dflynn20/jaffle_gaggle/blob/main/macros/extract_email_domain.sql), which we call in line 18 (which you can find in the pullout below). +We defined the email domain extraction as a [macro](/docs/build/jinja-macros) called [`extract_email_domain`](https://github.com/dflynn20/jaffle_gaggle/blob/main/macros/extract_email_domain.sql), which we call in line 18 (which you can find in the pullout below). 
This uses a regex to capture the text to the right of the ‘@’ character and makes sure to only use the lowercase email parameter before extracting the domain. This is because email domains aren’t case sensitive, but SQL is (see users 2954 and 3140 in the [seed data](https://github.com/dflynn20/jaffle_gaggle/blob/main/data/raw_user.csv) for an example). diff --git a/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md b/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md index a6e80f9ccdd..89fcb6f5890 100644 --- a/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md +++ b/website/blog/2022-02-23-founding-an-AE-team-smartsheet.md @@ -148,7 +148,7 @@ For example, we originally assumed the data flow from Raw into its Prod state wo To get through the fog of questions, we chose one principle on which to plant our flag: **Data must be in sync and always accessible across the database**. In our existing database, if you queried a table mid-update: too bad. You got weird data and might not even know it, or your query would fail. We wanted to do better and our cloud data warehouse provided the platform we needed to make it happen. -We settled on the following data transformation flow. Our transformation code is pulled every 8 hours onto a virtual machine (VM). A script on that VM triggers dbt to run that code, populating a staging database which is visible only to my team. Staging continues to update table by table until the run is successful.  When successful, Staging immediately clones over to Prod, with no downtime for users even if they are mid-query.  Everyone is happy. +We settled on the following [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) flow. Our transformation code is pulled every 8 hours onto a virtual machine (VM). A script on that VM triggers dbt to run that code, populating a staging database which is visible only to my team. Staging continues to update table by table until the run is successful.  
When successful, Staging immediately clones over to Prod, with no downtime for users even if they are mid-query.  Everyone is happy. ![Graphic depicting Smartsheet's finalized data transformation workflow structure](/img/blog/2022-02-23-founding-an-AE-team-smartsheet/new-transformation-workflow.png) diff --git a/website/blog/2022-04-05-when-backend-devs-spark-joy.md b/website/blog/2022-04-05-when-backend-devs-spark-joy.md index 3b87d3a7d6c..646785675e8 100644 --- a/website/blog/2022-04-05-when-backend-devs-spark-joy.md +++ b/website/blog/2022-04-05-when-backend-devs-spark-joy.md @@ -65,7 +65,7 @@ Analytics engineers can catch release changes that may break their models prior **Don’t forget the documentation.** -Regularly maintained and well-written documentation for backend application database tables helps analytics engineers and backend developers alike unpack complex data and data models. Documentation for backend application database tables might look like an entity relationship diagram (ERD) or an ERD supplemented with a living text-document providing greater detail into tables and fields. Furthermore, strong documentation helps analytics engineers write more descriptive [documentation for source models in dbt](https://docs.getdbt.com/docs/building-a-dbt-project/using-sources#testing-and-documenting-sources). +Regularly maintained and well-written documentation for backend application database tables helps analytics engineers and backend developers alike unpack complex data and data models. Documentation for backend application database tables might look like an entity relationship diagram (ERD) or an ERD supplemented with a living text-document providing greater detail into tables and fields. Furthermore, strong documentation helps analytics engineers write more descriptive [documentation for source models in dbt](/docs/build/sources#testing-and-documenting-sources). 
## A match made in heaven diff --git a/website/blog/2022-04-14-add-ci-cd-to-bitbucket.md b/website/blog/2022-04-14-add-ci-cd-to-bitbucket.md index 6cdb20d0df3..451013b1572 100644 --- a/website/blog/2022-04-14-add-ci-cd-to-bitbucket.md +++ b/website/blog/2022-04-14-add-ci-cd-to-bitbucket.md @@ -300,7 +300,7 @@ You’re all done! Now it’s time to test that things work: ## Conclusion -It’s important to remember that CI/CD is a convenience, not a panacea. You must still devise the model logic and determine the appropriate tests. Some things it can do, though: catch more mistakes early, make sure that the database always reflects the most up-to-date code, and decrease the friction in collaboration. By automating the steps that should *always* be taken, it frees you up to think about the unusual steps required (e.g., do your changes to [incremental models](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models) require an additional deployment with `--full-refresh`?) and reduces the amount of review that others’ actions necessitate. +It’s important to remember that CI/CD is a convenience, not a panacea. You must still devise the model logic and determine the appropriate tests. Some things it can do, though: catch more mistakes early, make sure that the database always reflects the most up-to-date code, and decrease the friction in collaboration. By automating the steps that should *always* be taken, it frees you up to think about the unusual steps required (e.g., do your changes to [incremental models](https://docs.getdbt.com/docs/build/incremental-models) require an additional deployment with `--full-refresh`?) and reduces the amount of review that others’ actions necessitate. Plus, it’s a good time, and it’s fun to watch the test lights turn green. Ding! 
diff --git a/website/blog/2022-04-19-complex-deduplication.md b/website/blog/2022-04-19-complex-deduplication.md index 613b63c04b8..daacff4eec6 100644 --- a/website/blog/2022-04-19-complex-deduplication.md +++ b/website/blog/2022-04-19-complex-deduplication.md @@ -68,37 +68,31 @@ Here’s a brief overview of the steps we’ll take: > Step 1 walks you through how to build a hashed entity id from column values using a macro. You’ll use this key in Step 2 to find the true duplicates and clean them out. -The idea in this step is to enable checking for duplicates in the data by attaching a unique key to the hashed values of the columns that make up the entity grain you want to track. It’s important to note here that the *[dbt_utils.surrogate_key](https://github.com/dbt-labs/dbt-utils/blob/0.8.2/macros/sql/surrogate_key.sql)* will not create a unique key yet! Instead, it will create a key that will be the same as the key of another row, as long as the column values we’ve selected for our entity grain are the same. *This is intentional and critical!*  The specific non-uniqueness is how we’ll catch our sneaky duplicates. +The idea in this step is to enable checking for duplicates in the data by attaching a unique key to the hashed values of the columns that make up the entity grain you want to track. It’s important to note here that the *[dbt_utils.generate_surrogate_key](https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/generate_surrogate_key.sql)* will not create a unique key yet! Instead, it will create a key that will be the same as the key of another row, as long as the column values we’ve selected for our entity grain are the same. *This is intentional and critical!*  The specific non-uniqueness is how we’ll catch our sneaky duplicates. In our example, you can see that the `surrogate_key` function builds the same `grain_id` for the two rows we know are duplicates, rows 2 and 3, with row 3 being the most recent row.
| grain_id | entity_grain | entity_id | unimportant_value | important_status | updated_at_date | |----------------------------------|--------------|-----------|-------------------|------------------|-----------------| | 8e0bd4a0e4a6e3a4ad3f28f13a3d5e51 | 1_pending | 1 | cool | pending | 2022-02-24 | -| c8b91b84808caaf5870d707866b59c | 1_submitted | 1 | lame | submitted | 2022-03-01 | +| c8b91b84808caaf5870d707866b59c | 1_submitted | 1 | boring | submitted | 2022-03-01 | | c8b91b84808caaf5870d707866b59c | 1_submitted | 1 | cool | submitted | 2022-03-03 | | 283ff22afb622dcc6a7da373ae1a0fb | 2_pending | 2 | cool | pending | 2022-02-27 | Remember, it’s important to only look for duplicate rows for the values that indicate a *true* difference between the rows of data; e.g., in type-two data, `updated_at_date` doesn’t mean that the other columns that we’ve decided we’re concerned with have changed since the previous time it was loaded, so that column doesn’t necessarily indicate a true difference between rows (though it usually indicates that something has changed, but that change may be outside our scope of concern in this case). But a change in `important_status`, for our purposes, would indicate a change in the data that you’d probably want to track. If you aren’t applying this technique to type-two data, but instead wanting to remove everything except the most recent data, you may have just a few columns that indicate a true difference between rows (an id at the right grain, and/or an id at a larger grain + timestamp). -To build our `grain_id` key, we use the pure gold of the *[dbt_utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/0.8.0/)*. If you’re unsure of what this package is, stop reading right now and make sure this is installed in your dbt project. It will bring joy to your life and ease to your struggling! +To build our `grain_id` key, we use the pure gold of the *[dbt_utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/)*.
If you’re unsure of what this package is, stop reading right now and make sure this is installed in your dbt project. It will bring joy to your life and ease to your struggling! -`dbt_utils.star` is the *star* [Editor’s note: 🤦‍♀️] of the show here, which allows you to grab all the columns, *except* the ones you list. If you only have a couple columns, it may be easier just to list them for the `cols` variable instead of using the `star` function. +`dbt_utils.get_filtered_columns_in_relation` is the star of the show here, which allows you to grab all the columns from a [relation](/reference/dbt-classes#relation) (reference/source), *except* the ones you specify, and put them into a list. If you only have a couple columns, it may be easier just to list them for the `cols` variable instead of using this function. ```sql -{% macro build_key_from_columns(table_name, exclude=[]) %} +{%- macro build_key_from_columns(dbt_relation, exclude=[]) -%} -{% set cols = {{ dbt_utils.star(from=ref('table_name'), except = exclude) }} %} - -{%- for col in cols -%} +{% set cols = dbt_utils.get_filtered_columns_in_relation(dbt_relation, exclude) %} - {%- do col_list.append("coalesce(cast(" ~ col.column ~ " as " ~ dbt_utils.type_string() ~ "), '')")  -%} +{{ return(dbt_utils.surrogate_key(cols)) }} -{%- endfor -%} - -{{ return(dbt_utils.surrogate_key(col_list)) }} - -{% endmacro %} +{%- endmacro -%} ``` For each row of data, this macro grabs each value from all the columns, except the columns we specify in the exclude list. Then it creates a hash-key using `dbt_utils.surrogate_key` that will reflect the uniqueness of the column values (i.e. if the combination of values is *not* unique, the `surrogate_key` will be the same, which is what we want to capture). The columns in the exclude list are values that we want to ignore when looking for a change in the data table (like `unimportant_value`, a column whose fluctuations we don’t want to indicate a real difference between rows).
Call the macro above to create a column in your base or staging layer, and call it `grain_id`, so we can filter out the changes where `count(grain_id) > 1`: diff --git a/website/blog/2022-05-03-making-dbt-cloud-api-calls-using-dbt-cloud-cli.md b/website/blog/2022-05-03-making-dbt-cloud-api-calls-using-dbt-cloud-cli.md index 47da8cbe2ff..91ad1080ce6 100644 --- a/website/blog/2022-05-03-making-dbt-cloud-api-calls-using-dbt-cloud-cli.md +++ b/website/blog/2022-05-03-making-dbt-cloud-api-calls-using-dbt-cloud-cli.md @@ -44,7 +44,7 @@ curl -H "Authorization:Token $DBT_CLOUD_API_TOKEN" -H "Content-Type:application/ -```js +``` dbt-cloud job run --job-id 43167 ``` @@ -71,7 +71,7 @@ I modified the script according to our needs and wrapped it in a `dbt-cloud job Now we had exactly what we wanted and our CI workflow in GitHub actions looked slick: -```js +``` - name: Trigger dbt Cloud job run run: | ./cool_script_bro.sh @@ -109,15 +109,15 @@ After the initial release I started to expand to cover the rest of the dbt Cloud In this example we’ll download a `catalog.json` artifact from the latest run of a dbt Cloud job using `dbt-cloud run list` and `dbt-cloud get-artifact` and then write a simple Data Catalog CLI application using the same tools that are used in `dbt-cloud-cli` (i.e., `click` and `pydantic`). Let’s dive right in! -The first command we need is the `dbt-cloud run list` which uses an [API V4 endpoint](https://docs.getdbt.com/dbt-cloud/api-v4#operation/list-account-runs) that returns runs sorted by creation date, with the most recent run appearing first. The command returns a JSON response that has one top-level attribute `data` that contains a list of runs. 
We’ll need to extract the `id` attribute of the first one and to do that we use [jq](https://stedolan.github.io/jq/): +The first command we need is the `dbt-cloud run list` which uses an [API endpoint](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#/operations/List%20Runs) that returns runs sorted by creation date, with the most recent run appearing first. The command returns a JSON response that has one top-level attribute `data` that contains a list of runs. We’ll need to extract the `id` attribute of the first one and to do that we use [jq](https://stedolan.github.io/jq/): -```js +``` latest_run_id=$(dbt-cloud run list --job-id $DBT_CLOUD_JOB_ID | jq .data[0].id -r) ``` Next, we use the `dbt-cloud get-artifact` command to download the `catalog.json` artifact: -```js +``` dbt-cloud run get-artifact --run-id $latest_run_id --path catalog.json -f catalog.json ``` diff --git a/website/blog/2022-05-24-joining-snapshot-complexity.md b/website/blog/2022-05-24-joining-snapshot-complexity.md index 2d7065d76fc..5b0d0cdf655 100644 --- a/website/blog/2022-05-24-joining-snapshot-complexity.md +++ b/website/blog/2022-05-24-joining-snapshot-complexity.md @@ -82,7 +82,7 @@ This boils down to the following steps: 1. Get rid of dupes if needed 2. Snapshot your data tables 3. Future-proof your `valid_to` dates -4. Join all your tables to build a fanned out spine containing the grain ids onto which we will join the rest of the data +4. Join your non-matching grain tables to build a fanned out spine containing the grain ids onto which we will join the rest of the data 5. Join the snapshots to the data spine on the appropriate id in overlapping timespans, narrowing the valid timespans per row as more tables are joined 6. Clean up your columns in final 7. 
Optional addition of global variable to filter to current values only @@ -152,15 +152,15 @@ coalesce(dbt_valid_to, cast('{{ var("future_proof_date") }}' as timestamp)) as v You will thank yourself later for building in a global variable. Adding important global variables will set your future-self up for success. Now, you can filter all your data to the current state by just filtering on `where valid_to = future_proof_date`*.* You can also ensure that all the data-bears with their data-paws in the data-honey jar are referencing the **same** `future_proof_date`, rather than `9998-12-31`, or `9999-12-31`, or `10000-01-01`, which will inevitably break something eventually. You know it will; don’t argue with me! Global vars for the win! -## Step 4: Join all your tables together to build a fanned out id spine +## Step 4: Join your tables together to build a fanned out id spine :::important What happens in this step? -Step 4 walks you through how to do your first join, in which you need to fan out the data spine to the finest grain possible and to include all the id onto which we will join the rest of the data. This step is crucial to joining the snapshots in subsequent steps. +Step 4 walks you through how to do your first join, in which you need to fan out the data spine to the finest grain possible and to include the id onto which we will join the rest of the data. This step is crucial to joining the snapshots in subsequent steps. ::: -Let’s look at how we’d do this with an example. You may have many events associated with a single `product_id`. Each `product_id` may have several `order_ids`, and each `order_id` may have another id associated with it. Which means that the grain of each table needs to be identified. The point here is that we need to build in an id at the finest grain. 
To do so, we’ll add in a [dbt_utils.surrogate_key](https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/surrogate_key.sql) in the staging models that live on top of the snapshot tables. +Let’s look at how we’d do this with an example. You may have many events associated with a single `product_id`. Each `product_id` may have several `order_ids`, and each `order_id` may have another id associated with it. Which means that the grain of each table needs to be identified. The point here is that we need to build in an id at the finest grain. To do so, we’ll add in a [dbt_utils.generate_surrogate_key](https://github.com/dbt-labs/dbt-utils/blob/main/macros/sql/generate_surrogate_key.sql) in the staging models that live on top of the snapshot tables. -Then, in your joining model, let’s add a CTE to build out our spine with all of our ids. +Then, in your joining model, let’s add a CTE to build out our spine with our ids of these different grains. ```sql build_spine as ( @@ -178,7 +178,7 @@ left join ... ) ``` -The result will be all the columns from your first table, fanned out as much as possible by the added id columns. We will use these id columns to join the historical data from our tables. +The result will be all the columns from your first table, fanned out as much as possible by the added `id` columns. We will use these `id` columns to join the historical data from our tables. It is extremely important to note that if you have tables as part of this pattern that are captured at the same grain as the original table, you **do not** want to join in that table and id as part of the spine. It will fan-out _too much_ and cause duplicates in your data. Instead, simply join the tables with the same grain as the original table (in this case, `historical_table_1` on `product_id`) in the next step, using the macro. 
| product_id | important_status | dbt_valid_from | dbt_valid_to | product_order_id | | --- | --- | --- | --- | --- | @@ -225,16 +225,14 @@ Your parameters are `cte_join`, the table that is creating the spine of your fin from {{cte_join}} left join {{cte_join_on}} on {{cte_join}}.{{cte_join_id}} = {{cte_join_on}}.{{cte_join_on_id}} - and (({{cte_join_on}}.{{cte_join_on_valid_from}} >= {{cte_join}}.{{cte_join_valid_from}} - and {{cte_join_on}}.{{cte_join_on_valid_from}} < {{cte_join}}.{{cte_join_valid_to}}) - or ({{cte_join_on}}.{{cte_join_on_valid_to}} >= {{cte_join}}.{{cte_join_valid_from}} - and {{cte_join_on}}.{{cte_join_on_valid_to}} < {{cte_join}}.{{cte_join_valid_to}})) + and ({{cte_join_on}}.{{cte_join_on_valid_from}} <= {{cte_join}}.{{cte_join_valid_to}} + and {{cte_join_on}}.{{cte_join_on_valid_to}} >= {{cte_join}}.{{cte_join_valid_from}}) {% endmacro %} ``` -The joining logic finds where the ids match and where the timestamps overlap between the two tables. We use the **greatest** `valid_from` and the **least** `valid_to` between the two tables to ensure that the new, narrowed timespan for the row is when the rows from both tables are valid. +The joining logic finds where the ids match and where the timestamps overlap between the two tables. We use the **greatest** `valid_from` and the **least** `valid_to` between the two tables to ensure that the new, narrowed timespan for the row is when the rows from both tables are valid. _**Update: Special thank you to Allyn Opitz for simplifying this join logic! 
It's so much prettier now.**_ You should see something like this as your end result: diff --git a/website/blog/2022-06-30-lower-sql-function.md b/website/blog/2022-06-30-lower-sql-function.md index 353b11376b0..c50af5f3fb3 100644 --- a/website/blog/2022-06-30-lower-sql-function.md +++ b/website/blog/2022-06-30-lower-sql-function.md @@ -2,6 +2,7 @@ title: "LOWER SQL function: Why we love it" description: "The LOWER SQL Function allows you to return a string value as an all lowercase string. It’s an effective way to create consistent capitalization for string values across your data." slug: lower-sql-love-letter +canonical_url: https://docs.getdbt.com/sql-reference/lower authors: [kira_furuichi] diff --git a/website/blog/2022-07-05-date-trunc-sql-love-letter.md b/website/blog/2022-07-05-date-trunc-sql-love-letter.md index 99f658590a6..916f5b505a3 100644 --- a/website/blog/2022-07-05-date-trunc-sql-love-letter.md +++ b/website/blog/2022-07-05-date-trunc-sql-love-letter.md @@ -2,6 +2,7 @@ title: "DATE_TRUNC SQL function: Why we love it" description: "The DATE_TRUNC function will truncate a date or time to the first instance for a given date part maintaining a date format. Wordy, wordy, wordy! What does this really mean?" slug: date-trunc-sql +canonical_url: https://docs.getdbt.com/sql-reference/date-trunc authors: [kira_furuichi] diff --git a/website/blog/2022-07-05-datediff-sql-love-letter.md b/website/blog/2022-07-05-datediff-sql-love-letter.md index 5661dde7a9e..27ab84d4909 100644 --- a/website/blog/2022-07-05-datediff-sql-love-letter.md +++ b/website/blog/2022-07-05-datediff-sql-love-letter.md @@ -2,6 +2,7 @@ title: "DATEDIFF SQL function: Why we love it" description: "The DATEDIFF function will return the difference in specified units (ex. days, weeks, years) between a start date/time and an end date/time. It’s a simple and widely used function that you’ll find yourself using more often than you expect." 
slug: datediff-sql-love-letter +canonical_url: https://docs.getdbt.com/sql-reference/datediff authors: [kira_furuichi] @@ -68,7 +69,7 @@ Three minor differences in the implementation here: You may be able to memorize the syntax for the DATEDIFF function for the primary data warehouse you use. What happens when you switch to a different one for a new job or a new data stack? Remembering if there’s an underscore in the function name or which argument the `<date part>` is passed in as is… no fun and leads to the inevitable, countless “datediff in bigquery” Google searches. -Luckily, [dbt-core](https://github.com/dbt-labs/dbt-core) has your back! dbt Core is the open source dbt product that helps data folks write their data transformations following software engineering best practices. +Luckily, [dbt-core](https://github.com/dbt-labs/dbt-core) has your back! dbt Core is the open source dbt product that helps data folks write their [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) following software engineering best practices. With dbt v1.2, [adapters](https://docs.getdbt.com/docs/supported-data-platforms) now support [cross-database macros](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros) to help you write certain functions, like [DATE_TRUNC](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros#date_trunc) and [DATEDIFF](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros#datediff), without having to memorize sticky function syntax.
diff --git a/website/blog/2022-07-12-change-data-capture.md b/website/blog/2022-07-12-change-data-capture.md index b765cec9c95..7e26c7375a8 100644 --- a/website/blog/2022-07-12-change-data-capture.md +++ b/website/blog/2022-07-12-change-data-capture.md @@ -190,7 +190,7 @@ Below, you’ll find two solutions that are more effective than snapshotting a f ## Solution #1: Downstream Incremental Model -Instead of using snapshots, Joanne could create an [incremental model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models) downstream of `fct_income` to “grab” every point-in-time version of `fct_income` – let’s call this incremental model `int_income_history` and assume it has the following config block: +Instead of using snapshots, Joanne could create an [incremental model](https://docs.getdbt.com/docs/build/incremental-models) downstream of `fct_income` to “grab” every point-in-time version of `fct_income` – let’s call this incremental model `int_income_history` and assume it has the following config block: ```sql {{ diff --git a/website/blog/2022-07-13-star-sql-love-letter.md b/website/blog/2022-07-13-star-sql-love-letter.md index 87469dc2730..a84750198de 100644 --- a/website/blog/2022-07-13-star-sql-love-letter.md +++ b/website/blog/2022-07-13-star-sql-love-letter.md @@ -26,13 +26,13 @@ from {{ ref('table_a') }} At this point, you realize your will to continue typing out the next 52 columns has essentially dwindled down to nothing and you’re probably questioning the life choices that led you here. -But what if there was a way to make these 56+ lines of code come down to a handful? Well, that’s where a handy [dbt macro](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros) comes into play. +But what if there was a way to make these 56+ lines of code come down to a handful? Well, that’s where a handy [dbt macro](/docs/build/jinja-macros) comes into play. 
## The `star` dbt macro -dbt supports [dbt_utils](https://github.com/dbt-labs/dbt-utils), a [package of macros and tests](https://docs.getdbt.com/docs/building-a-dbt-project/package-management) that data folks can use to help them write more code in their dbt project. One of the macros dbt utils offers is the `star` generator. +dbt supports [dbt_utils](https://github.com/dbt-labs/dbt-utils), a [package of macros and tests](https://docs.getdbt.com/docs/build/packages) that data folks can use to help them write more code in their dbt project. One of the macros dbt utils offers is the `star` generator. This macro: diff --git a/website/blog/2022-07-19-migrating-from-stored-procs.md b/website/blog/2022-07-19-migrating-from-stored-procs.md index 2140bc24dc1..691284a49e9 100644 --- a/website/blog/2022-07-19-migrating-from-stored-procs.md +++ b/website/blog/2022-07-19-migrating-from-stored-procs.md @@ -26,7 +26,7 @@ It’s worth considering if an alternative approach with dbt might help. ## Why use modular dbt models instead of stored procedures? -We work with many analytics teams to refactor their stored procedure code into dbt. Many of them come in thinking that the upfront effort to modernize their approach to data transformation will be too much to justify. However, we see that in the long term this isn’t the case. +We work with many analytics teams to refactor their stored procedure code into dbt. Many of them come in thinking that the upfront effort to modernize their approach to [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) will be too much to justify. However, we see that in the long term this isn’t the case. For example, a dbt Cloud user achieved the following results when moving away from the stored procedure approach: @@ -79,10 +79,10 @@ In general, we've found that the recipe presented below is an effective conversi 1. Map data flows in the stored procedure 2. Identify raw source data 3. 
Create a staging layer on top of raw sources for initial data transformations such as data type casting, renaming, etc. -4. Replace hard-coded table references with dbt [source()](https://docs.getdbt.com/docs/building-a-dbt-project/using-sources) and [ref()](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) statements. This enables 1) ensuring things are run in the right order and 2) automatic documentation! +4. Replace hard-coded table references with dbt [source()](/docs/build/sources) and [ref()](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) statements. This enables 1) ensuring things are run in the right order and 2) automatic documentation! 5. Map INSERTS and UPDATES in the stored procedure to SELECT in dbt models 6. Map DELETES in the stored procedure to WHERE filters in dbt models -7. If necessary, use [variables](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/using-variables) in dbt to dynamically assign values at runtime, similar to arguments passed to a stored procedure. +7. If necessary, use [variables](/docs/build/project-variables) in dbt to dynamically assign values at runtime, similar to arguments passed to a stored procedure. 8. Iterate on your process to refine the dbt [DAG](https://docs.getdbt.com/docs/introduction#what-makes-dbt-so-powerful) further. You could continue optimizing forever, but typically we find a good stopping point when the outputs from the stored procedure and final dbt models are at parity. Sometimes, we find ourselves confronted with code that’s so complex, the end user isn’t able to understand exactly what it’s doing. In these cases, it may not be possible to perform an apples-to-apples mapping of the process embedded in the original stored procedure, and it’s actually more efficient to scrap the whole thing and focus on working backwards to reproduce the desired output in dbt. Note the section on auditing results below as a key success driver in this situation. 
diff --git a/website/blog/2022-07-26-pre-commit-dbt.md b/website/blog/2022-07-26-pre-commit-dbt.md index 3873e46e3ae..e0b41d82d0c 100644 --- a/website/blog/2022-07-26-pre-commit-dbt.md +++ b/website/blog/2022-07-26-pre-commit-dbt.md @@ -10,6 +10,8 @@ date: 2022-08-03 is_featured: true --- +*Editor's note — since the creation of this post, the package pre-commit-dbt's ownership has moved to another team and it has been renamed to [dbt-checkpoint](https://github.com/dbt-checkpoint/dbt-checkpoint). A redirect has been set up, meaning that the code example below will still work. It is also possible to replace `repo: https://github.com/offbi/pre-commit-dbt` with `repo: https://github.com/dbt-checkpoint/dbt-checkpoint` in your `.pre-commit-config.yaml` file.* + At dbt Labs, we have [best practices](https://docs.getdbt.com/docs/guides/best-practices) we like to follow for the development of dbt projects. One of them, for example, is that all models should have at least `unique` and `not_null` tests on their primary key. But how can we enforce rules like this? That question becomes difficult to answer in large dbt projects. Developers might not follow the same conventions. They might not be aware of past decisions, and reviewing pull requests in git can become more complex. When dbt projects have hundreds of models, it's hard to know which models do not have any tests defined and aren't enforcing your conventions. @@ -91,7 +93,7 @@ A normal next step after installing pre-commit is to run a `pre-commit install` Instead, we can do a `pre-commit run --all-files`, which will run all the tests defined in our configuration file on all the files in our dbt project. 
-[![Animation showing the output in the Terminal after running the above commands](https://asciinema.org/a/lTmefht77ZEr6kmP7DymaxjRF.svg)](https://asciinema.org/a/lTmefht77ZEr6kmP7DymaxjRF) +![Animation showing the output in the Terminal after running the above commands](/img/blog/2022-07-26-pre-commit-dbt/pre-commit-run-all-files.gif) In my case, I can see that my model called `customers.sql` has not been added to any YAML file and has no test defined. @@ -165,7 +167,7 @@ jobs: steps: - name: Checkout branch - uses: actions/checkout@v2 + uses: actions/checkout@v3 # Using bash and pip to install dbt and pre-commit # Update the dbt installation command to include the adapter you need diff --git a/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md b/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md deleted file mode 100644 index 3db07f6f26a..00000000000 --- a/website/blog/2022-07-27-understanding-the-components-of-the-dbt-semantic-layer.md +++ /dev/null @@ -1,173 +0,0 @@ ---- -title: "Understanding the components of the dbt Semantic Layer" -description: "Heard about dbt Metrics or the dbt Semantic Layer and curious to give them a try? Callum McCann digs into what they are, walks through an example, and discusses how they all fit together!" -slug: understanding-the-components-of-the-dbt-semantic-layer - -authors: [callum_mccann] - -tags: [dbt product updates] -hide_table_of_contents: false - -date: 2022-07-27 -is_featured: true ---- - -# Getting started with the dbt Semantic Layer - -> TLDR: The Semantic Layer is made up of a combination of open-source and SaaS offerings and is going to change how your team defines and consumes metrics. - -At last year's Coalesce, Drew showed us the future[^1] - a vision of what metrics in dbt could look like. Since then, we've been getting the infrastructure in place to make that vision a reality. 
We wanted to share with you where we are today and how it fits into the broader picture of [where we're going](https://www.getdbt.com/blog/dbt-semantic-layer). - -To those who haven't followed this saga with the intensity of [someone watching their investments on the crypto market](https://mobile.twitter.com/scannergr1/status/1536198701215109122/photo/1), we're rolling out this new resource to help you better understand the dbt Semantic Layer and provide clarification on the following things: - -1. What is the dbt Semantic Layer? -2. How do I use it? -3. What is publicly available now? -4. What is still in development? - -With that, lets get into it! - - - -> Some of you might have been around when this was initially being referred to as the Metrics Layer. As we evaluated the long term plans for what this part of dbt was going to become, we realized that naming it the Semantic Layer better reflected its capabilities and where we plan on taking it. - -## What is the dbt Semantic Layer? - -The dbt Semantic Layer is a new part of dbt to help improve precision and consistency while expanding flexibility and capability in the modern data stack. Our maestro of metrics, Drew Banin, [released a blog post detailing the vision of where we're going here](https://www.getdbt.com/blog/dbt-semantic-layer). The first use case that we are addressing is one that most practicioners **and** stakeholders are familiar with - metrics. We'll walk through what this looks like in practice later on in this post. - -Under the hood, the dbt Semantic layer is collection of several components - some of these are part of dbt Core, some part of dbt Cloud, and some are net new functionality. They all [combine together like Voltron](https://www.youtube.com/watch?v=5rPSLQxMT8w) to create a single experience through which business users can query data in the context of the metric that is most familiar to them. And the best part is that they can do it in systems they are already comfortable using. 
- -***What will this look like for my data consumers and business stakeholders?*** - -Ultimately, this looks like people being able to interact with trusted datasets in the tools that they are comfortable with (and eventually new tools designed specifically around metrics). - -An example that we’ve found helpful is [ARR](https://www.zuora.com/billing-topics/annual-recurring-revenue/#:~:text=Annual%20Recurring%20Revenue%2C%20or%20ARR,for%20a%20single%20calendar%20year). A business-critical metric to SaaS companies, ARR can be a tricky calculation to keep consistent across all of the tools used in the business. With the dbt Semantic Layer, this definition would live in dbt and the logic to create the dataset for that metric would be consistent across all different consuming experiences. Best of all, definition changes would get reflected in downstream tools, so you no longer need to manually search and update every downstream dependency. Callum of 3 years ago is jumping with joy. - -***That’s good and all, but what does this look like for practitioners to use?*** - -The dbt Semantic layer is comprised of the following components[^2]: - -**Available Today** - -- **[`metric` node in dbt Core :](https://docs.getdbt.com/docs/building-a-dbt-project/metrics)** Similar to `models` or `sources` , this is a specific node type in dbt Core. It is the definition of a time-series aggregation over a table that supports zero or more dimensions. The resulting node is stored in the `manifest.json` just like `models` and referenced in the DAG. -- **[`dbt_metrics` package:](https://github.com/dbt-labs/dbt_metrics)** this package provides macros that combine the version-controlled metric definition and query-time parameters (like dimensions, a time grain, and secondary calculations) to generate a SQL query which calculates the metric value. 
-- **[dbt Cloud Metadata API](https://docs.getdbt.com/docs/dbt-cloud-apis/metadata-api):** a GraphQL API which supports arbitrary queries over the metadata produced by dbt Cloud jobs. Contains metadata related to the accuracy, recency, configuration, and structure of the views and tables in the warehouse, as well as much more. - -**New** - -- **dbt Server:** this component wraps dbt Core in a persistent server that is responsible for handling RESTful API requests for dbt operations. It’s a thin interface that is primarily responsible for performance and reliability in production environments. -- **dbt Cloud proxy server:** this component enables dbt Cloud to dynamically rewrite requests to a data warehouse and compile dbt-SQL into raw SQL that the database understands. It then returns the dataset produced by the raw SQL to the platform that sent it. - -![Untitled](/img/blog/2022-07-27-getting-started-with-the-dbt-semantic-layer/semantic-layer-description.png) - -### Understanding how and when to use metrics? - -> Use of metrics and the metrics package is recommended for experienced dbt users and early adopters who want to explore this functionality. - -Let's walk through an example of how you can use the components above to get started today using our old friend - [the Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics). We'll take a look at how you can start defining and testing metrics today as well as how you'll interact with them once the new components are released. - -**When to use Metrics** - -The first question you need to ask is, *Should we be using metrics?* - -It is our belief that metrics are not a one-size fits all solution. They are designed for core business metrics where consistency and precision are of key importance, not for exploratory use cases or ad hoc analysis. 
Our shorthand way of determining whether the metric should be defined in dbt has been - *is this something our teams need to report on?* - -So, let’s say the CFO of our Jaffle comes to us on a Monday morning and commands the data team to overhaul how we're reporting on Revenue. Our Regional Manager Jim and Sales Director Pam[^3] have been giving him different reports! Right now its a mess of tools and inconsistencies - Jim’s numbers are defined in Tableau and say one thing, Pam’s within Hex and say another! The CFO is frustrated with it and wants a cohesive experience across the company where everyone has the same numbers for revenue. It passes the report test, it’s an important business metric; away we go! - -**Defining the Metric with Metric Node** - -In this example, we’ll say that both Jim and Pam are pulling from a table created by dbt called `orders`. It currently contains fields for `amount` and all different methods of payment_amounts, such as credit cards or gift cards. Jim has been calculating revenue by summing up the `credit_card_amount` and `gift_card_amount` fields, as he forgot to update his definition when the business added coupons and bank transfers payments. Meanwhile, Pam is correctly summing the `amount` field but hasn’t accounted for return orders that shouldn’t be counted! - -The first step is creating a unified definition for what revenue is. 
In order to do this, we will create the following yml definition within our dbt repo: - -```yaml -version: 2 - -metrics: - - name: revenue - label: Revenue - model: ref('orders') - description: "The total revenue of our jaffle business" - - type: sum - sql: amount - - timestamp: order_date - time_grains: [day, week, month, year] - - dimensions: - - customer_status - - has_coupon_payment - - has_bank_transfer_payment - - has_credit_card_payment - - has_gift_card_payment - - filters: - - field: status - operator: '=' - value: "'completed'" -``` - -This metric has now been defined in the dbt metadata and can be seen in the DAG! - -![Untitled](/img/blog/2022-07-27-getting-started-with-the-dbt-semantic-layer/metrics-dag.png) - -**Running The Metric Package To calculate the metric** - -In order to ensure that both Jim and Pam are retrieving the same numbers for their metric, we’ll need them to both run a metrics `calculate` query. In this example, we’re not interested in the specific payment types and only want to see revenue broken up by `week` and `customer_status`. - -```sql -select * -from {{ metrics.calculate( - metric('revenue'), - grain='week', - dimensions=['customer_status'] -) }} -``` -This would return a dataset that looks like this: - -| date_week | customer_status | revenue | -| --- | --- | --- | -| 2018-01-01 | Churn Risk | 43 | -| 2018-01-01 | Churned | 0 | -| 2018-01-01 | Healthy | 26 | -| 2018-01-08 | Churn Risk | 27 | - -Jim and Pam would then be able to reference the `revenue` column within the newly created dataset and never have to worry about the calculation of revenue ever again[^4]! The world is perfect and [balance has been restored.](https://www.youtube.com/watch?v=d1EnW4kn1kg) - -**In the near future with dbt Server** - -When dbt Server releases later this year, the flow of how metrics are consumed will change significantly. 
Your organization will no longer need to materialize each metric within a model in order to take advantage of the metric definition. Instead, you’ll be able to directly query dbt Server with the metric code provided and have the correct dataset returned to your BI tool of choice. - -Additionally, integration partners will have built out experiences around Metrics using the Metadata API to create unique and creative ways for consumers to obtain metric data while abstracting away complexity. For example, a box that allows the user to select from a list of metrics, time grains, dimensions, and secondary calculation and then have the correct information returned to them regardless of the selection! - -### So what is publicly available now? - -Right now, the two main open-source components that are publicly available are the [`metric` node](https://docs.getdbt.com/docs/building-a-dbt-project/metrics) within dbt Core and the `dbt_metrics` package. Combined, these two can operate an introductory semantic layer experience by allowing analytics engineers to define metrics and then query that metric via the metrics package. - -These two components are a static experience that have to be defined in the dbt project (as the selected dimensions are defined at model creation) but are useful for those who want to ensure that metrics remain consistent across every BI tool. If you identify with any of the following conditions, you could be a good fit for implementing this as it exists today: - -- You want to prepare your organization for the full Semantic Layer launch. -- Your organization has at least a few key metrics -- Your organization uses 1 or more BI tools -- Your organization occasionally has issues around different metric calculations -- Your organization wants a centralized location for all metrics so everyone in the business knows where to look - -All of these are great reasons to begin exploring implementing metrics in your dbt project! 
If you’re curious about what an implementation of this might look like, we recommend referencing the [jaffle_shop_metrics](https://github.com/dbt-labs/jaffle_shop_metrics) repo! - -### What is still in development? - -Both the dbt Cloud proxy server and dbt Server are currently in development, with a scheduled release of later this year. If you’re curious about testing them once they are released, we recommend keeping an eye on our product announcements and then reaching out once they become publicly available! - -### What if I have questions? - -If you have any questions about those components, or metrics in general, please feel free to post in the #dbt-metrics-and-server channel on dbt Slack! I hang around there and am always willing to chat metrics! - -### Footnotes -[^1]: That future may not have mentioned robots but I'm holding out for [Jetson's style morning machine](https://www.youtube.com/watch?v=-0S3Jf-NxdI) to help me get ready in the morning. - -[^2]: We’re specifically calling out the licensing because there is a lot of confusion in the community around what is open-source and what isn’t. This is only becoming trickier with the introduction of the BSL licensing, which ensures users can run their own server but it cannot be sold as a cloud service. For more information on why these licensing types were picked, we recommend [Tristan’s blog around licensing dbt.](https://www.getdbt.com/blog/licensing-dbt/). The big takeaway around licensing is that you can still run components of the dbt Semantic Layer even if you aren’t a dbt Cloud customer! - -[^3]: Full transparency, I've never seen the Office. The awkward humor makes me so uncomfortable that I have to turn off the TV. Apologies if the titles of the characters are incorrect. - -[^4]: Psych! They’re definitely interested in the calculation of ARR. In fact, they don’t really trust the numbers **unless** they understand how it’s calculated. 
This is where they could use the Metadata API in order to query all the information about the metric, such as definition, run-time, acceptable dimensions, etc. Right now Jim and Pam would need to query the API directly but in the future we expect there to be a number of different ways to obtain this information, ranging from [direct integration with the BI tool](https://learn.hex.tech/docs/connect-to-data/data-connections/dbt-integration) all the way to having that information materialized in a dbt information schema! *For current tabular alternatives, there are some interesting macros in the newly released [dbt-project-evaluator package](https://github.com/dbt-labs/dbt-project-evaluator). Take a look there if you’re curious about materializing your metric information!* \ No newline at end of file diff --git a/website/blog/2022-08-12-how-we-shaved-90-minutes-off-long-running-model.md b/website/blog/2022-08-12-how-we-shaved-90-minutes-off-long-running-model.md index 09cb9c4c178..020a48c763f 100644 --- a/website/blog/2022-08-12-how-we-shaved-90-minutes-off-long-running-model.md +++ b/website/blog/2022-08-12-how-we-shaved-90-minutes-off-long-running-model.md @@ -22,7 +22,7 @@ The dbt Labs internal project is a beast! Our daily incremental dbt Cloud job ru ![Model Timing tab before picture](/img/blog/2022-08-12-model-timing/model_timing_before.png) -As you can see, it's straightforward to identify the model that's causing the long run times and holding up other models. The model `fct_dbt_invocations` takes, on average, 1.5 hours to run. This isn't surprising, given that it's a relatively large dataset (~5B records) and that we're performing several intense SQL calculations. Additionally, this model calls an [ephemeral model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/materializations#ephemeral) named `dbt_model_summary` that also does some heavy lifting. Still, we decided to explore if we could refactor this model and make it faster. 
+As you can see, it's straightforward to identify the model that's causing the long run times and holding up other models. The model `fct_dbt_invocations` takes, on average, 1.5 hours to run. This isn't surprising, given that it's a relatively large dataset (~5B records) and that we're performing several intense SQL calculations. Additionally, this model calls an [ephemeral model](https://docs.getdbt.com/docs/build/materializations#ephemeral) named `dbt_model_summary` that also does some heavy lifting. Still, we decided to explore if we could refactor this model and make it faster. After refactoring this code, we ended up swapping the ephemeral model `dbt_model_summary` to an incremental model that took the bulk of the processing out of the main `fct_dbt_invocations` model. Instead of recalculating this complex logic every run, we pull only new data and run that logic on the smaller subset of those records. The combined run time of the new `dbt_model_summary` and `fct_dbt_invocations` is now ~15-20 minutes, a savings of over an hour per run! @@ -121,11 +121,11 @@ The window functions referenced above are answering the following questions: Given the size and complexity of this query, the first few approaches we took didn’t focus on changing the query as much as optimizing our objects and materializations. -The two window functions (`row_number()` and `mode()` in the `diffed` above) were in an [ephemeral model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/materializations#ephemeral) which isn’t stored in the data warehouse, but is instead executed in-memory at run time. Since it was obvious our virtual warehouse was running out of memory (remote storage spillage), we tried swapping that to a view, then a table materialization. Neither of these improved the run time significantly, so we tried clustering the table. However, since our two window functions are at different grains there wasn’t a great clustering key we found for this. 
+The two window functions (`row_number()` and `mode()` in the `diffed` above) were in an [ephemeral model](https://docs.getdbt.com/docs/build/materializations#ephemeral) which isn’t stored in the data warehouse, but is instead executed in-memory at run time. Since it was obvious our virtual warehouse was running out of memory (remote storage spillage), we tried swapping that to a view, then a table materialization. Neither of these improved the run time significantly, so we tried clustering the table. However, since our two window functions are at different grains there wasn’t a great clustering key we found for this. ### Attempt #2: Moving to an incremental model -The final strategy we tried, which ended up being the solution we implemented, was to swap the ephemeral model (`dbt_model_summary`) to an [incremental model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models). Since we’re calculating metrics based on historical events (**first** model run, most frequent model run **today**), an incremental model let us perform the calculation for all of history once in an initial build, then every subsequent build only needs to look at a much smaller subset of the data to run it’s calculations. +The final strategy we tried, which ended up being the solution we implemented, was to swap the ephemeral model (`dbt_model_summary`) to an [incremental model](https://docs.getdbt.com/docs/build/incremental-models). Since we’re calculating metrics based on historical events (**first** model run, most frequent model run **today**), an incremental model let us perform the calculation for all of history once in an initial build, then every subsequent build only needs to look at a much smaller subset of the data to run its calculations. One of the biggest problems with the ephemeral model was remote spillage due to lack of memory, so having a smaller dataset to run the calculation against made a massive impact.
Snowflake can easily calculate a daily mode or a first model run when we only had to look at a sliver of the data each time. diff --git a/website/blog/2022-08-17-managing-surrogate-keys-in-dbt.md b/website/blog/2022-08-17-managing-surrogate-keys-in-dbt.md index cae25bc4b96..3f8e12f3a6f 100644 --- a/website/blog/2022-08-17-managing-surrogate-keys-in-dbt.md +++ b/website/blog/2022-08-17-managing-surrogate-keys-in-dbt.md @@ -20,7 +20,7 @@ Sometimes, we are lucky enough to have data sources with these keys built right ## How were surrogate keys managed in the past? -Before the advent of the analytical warehouse tools we use today, the data warehouse architecture had a few key constraints that led to the rise of the Kimball-style warehouse with a snowflake schema. This was because storage was expensive — it was more efficient to store data as few times as possible, and rely on joins to connect data tog ether when a report required it. And to make those joins efficient, it became standard practice to use **monotonically increasing integer surrogate keys (MIISKs)**, a fancy way to say “count each record starting at one” so that your data model would look something like this (you are a cheesemonger): +Before the advent of the analytical warehouse tools we use today, the data warehouse architecture had a few key constraints that led to the rise of the Kimball-style warehouse with a snowflake schema. This was because storage was expensive — it was more efficient to store data as few times as possible, and rely on joins to connect data together when a report required it.
And to make those joins efficient, it became standard practice to use **monotonically increasing integer surrogate keys (MIISKs)**, a fancy way to say “count each record starting at one” so that your data model would look something like this (you are a cheesemonger): | product_id | product_name | created_by | created_at | | --- | --- | --- | --- | @@ -194,7 +194,7 @@ agg as ( final as ( select - {{ dbt_utils.surrogate_key([ + {{ dbt_utils.generate_surrogate_key([ 'report_date', 'user_id' ]) @@ -216,7 +216,7 @@ The analytical warehouses we use now no longer have the same constraints that tr This strategy is not without its caveats either! -- **Collisions -** Although it's *exceedingly* rare, depending on the hashing algorithm you use, it's possible for two different sets of inputs to produce the same outputs, causing erroneous duplicate records in your dataset. Using an MD5 hash (the default for the `dbt_utils.surrogate_key` macro), you have a 50% of a collision when you get up to 2^64 records (1.84 x 10E19 aka a whole lot of data). While [very very very unlikely](https://docs.getdbt.com/terms/surrogate-key#a-note-on-hashing-algorithms), it’s certainly something to consider for truly massive datasets. +- **Collisions -** Although it's *exceedingly* rare, depending on the hashing algorithm you use, it's possible for two different sets of inputs to produce the same outputs, causing erroneous duplicate records in your dataset. Using an MD5 hash (the default for the `dbt_utils.generate_surrogate_key` macro), you have a 50% chance of a collision when you get up to 2^64 records (1.84 x 10^19 aka a whole lot of data). While [very very very unlikely](https://docs.getdbt.com/terms/surrogate-key#a-note-on-hashing-algorithms), it’s certainly something to consider for truly massive datasets.
- **Datatypes -** If you’re in the process of migrating legacy code to a new warehouse provider, you likely have some constraints on the datatype of your keys from the consumers of your datasets, and may have some issues converting to a string-based key. Luckily, some warehouse providers have hash functions that output integer values (like Snowflake’s `MD5_UPPER/LOWER_64` functions). However, these have fewer bits in the hashing function, so may lead to collision issues on big data sets. - **Performance -** Hashed keys generally result in long string-type values. On massive datasets on some warehouses, this could cause some performance issues. Unlike MIISKs, string values can’t be easily partitioned to improve query performance. Luckily, as described in the above bullet point, you can choose to utilize hashing functions that output other, more performant datatypes! - **Storage -** As mentioned above, hash keys will end up with higher storage costs than their MIISK counterparts. Given that the cost of storage in cloud warehouses is extremely cheap, it’s unlikely to be worth the effort to optimize for storage costs. diff --git a/website/blog/2022-08-22-narrative-modeling.md b/website/blog/2022-08-22-narrative-modeling.md index 71252279873..a5418ccded1 100644 --- a/website/blog/2022-08-22-narrative-modeling.md +++ b/website/blog/2022-08-22-narrative-modeling.md @@ -173,7 +173,7 @@ To that final point, if presented with the DAG from the narrative modeling appro ### Intangible business steps are easier to model - Knowledge gaps are captured accurately. For instance, if the best way you know that a shipment was received by a customer is that a truck driver scanned it out of the system, you can model `shipment_scanned_out` as an explicit model, followed by an implicit`shipment_received` model referencing it. 
This stores in code the company’s current point-of-view that the scanning action is the best information available -- Certain business decisions directly drive data transformations. If an entire package costs $50.00 to ship and it has multiple items inside, shipping cost could be attributed to each item via weight or product value. In either case, teams can capture this attribution as `item_apportioned_shipping_cost`. +- Certain business decisions directly drive [data transformations](https://www.getdbt.com/analytics-engineering/transformation/). If an entire package costs $50.00 to ship and it has multiple items inside, shipping cost could be attributed to each item via weight or product value. In either case, teams can capture this attribution as `item_apportioned_shipping_cost`. ### Users can tie business concepts to source data diff --git a/website/blog/2022-08-22-unit-testing-dbt-package.md b/website/blog/2022-08-22-unit-testing-dbt-package.md index 80c0ad639a7..466336a0738 100644 --- a/website/blog/2022-08-22-unit-testing-dbt-package.md +++ b/website/blog/2022-08-22-unit-testing-dbt-package.md @@ -12,7 +12,7 @@ is_featured: true _Editors note - this post assumes working knowledge of dbt Package development. For an introduction to dbt Packages check out [So You Want to Build a dbt Package](https://docs.getdbt.com/blog/so-you-want-to-build-a-package)._ -It’s important to be able to test any dbt Project, but it’s even more important to make sure you have robust testing if you are developing a [dbt Package](https://docs.getdbt.com/docs/building-a-dbt-project/package-management). +It’s important to be able to test any dbt Project, but it’s even more important to make sure you have robust testing if you are developing a [dbt Package](https://docs.getdbt.com/docs/build/packages). I love dbt Packages, because it makes it easy to extend dbt’s functionality and create reusable analytics resources. 
Even better, we can find and share dbt Packages which others developed, finding great packages in [dbt hub](https://hub.getdbt.com/). However, it is a bit difficult to develop complicated dbt macros, because dbt on top of [Jinja2](https://palletsprojects.com/p/jinja/) is lacking some of the functionality you’d expect for software development - like unit testing. @@ -109,9 +109,9 @@ Your dbt Package may support multiple adapters. If you are a postgres user, you {% endmacro %} -{% macro postgress__to_literal(text) %} +{% macro postgres__to_literal(text) %} - '{{- text -}}' + E'{{- text -}}' {% endmacro %} ``` diff --git a/website/blog/2022-08-31-august-product-update.md b/website/blog/2022-08-31-august-product-update.md index df326039f25..cb4077f3a06 100644 --- a/website/blog/2022-08-31-august-product-update.md +++ b/website/blog/2022-08-31-august-product-update.md @@ -4,7 +4,6 @@ description: "Coalesce is less than 2 months away!" slug: dbt-product-update-2022-august authors: [lauren_craigie] -tags: [dbt updates] hide_table_of_contents: false date: 2022-08-31 @@ -23,7 +22,7 @@ You’ll hear more in [Tristan’s keynote](https://coalesce.getdbt.com/agenda/k ## **What's new** -- **dbt Core v1.3 beta:** Do you use Python for analytics? The first beta prerelease of dbt Core v1.3—including support for dbt models written in Python—is [ready to explore](https://docs.getdbt.com/guides/migration/versions/upgrading-to-v1.3)! Check it out, and read more about dbt supported Python models [in our docs](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/python-models). +- **dbt Core v1.3 beta:** Do you use Python for analytics? The first beta prerelease of dbt Core v1.3—including support for dbt models written in Python—is [ready to explore](https://docs.getdbt.com/guides/migration/versions/upgrading-to-v1.3)! Check it out, and read more about dbt supported Python models [in our docs](/docs/build/python-models). 
- **Technology Partner Program:** We just launched our new [Technology Partner Program](https://www.getdbt.com/blog/dbt-labs-technology-partner-program/) with 40+ friends in the Modern Data Stack to provide consistent support for seamless integrations joint-users can trust. Check our new [dbt Cloud integrations page](http://www.getdbt.com/product/integrations) for what’s available today! - **Single-tenant users:** dbt Cloud v1.1.60 is now available on dbt Cloud Enterprise. @@ -54,7 +53,7 @@ You’ll hear more in [Tristan’s keynote](https://coalesce.getdbt.com/agenda/k I just discovered the treasure trove of excellent resources from dbt Labs consulting partners, and want to start sharing more here. Here’s a few you might have missed over the summer: -- **Reduce ETL costs:** I’ve only just seen [this blog](https://www.mighty.digital/blog/how-dbt-helped-us-reduce-our-etl-costs-significantly) from Mighty Digital, but found it to be a super practical (and concise) introductory guide to rethinking your ETL pipeline with dbt. +- **Reduce ETL costs:** I’ve only just seen [this blog](https://www.mighty.digital/blog/how-dbt-helped-us-reduce-our-etl-costs-significantly) from Mighty Digital, but found it to be a super practical (and concise) introductory guide to rethinking your ETL pipeline with dbt. - **Explore data:** [Part two of a series on exploring data](https://vivanti.com/2022/07/28/exploring-data-with-dbt-part-2-extracting/) brought to you by Vivanti. This post focuses on working with objects in dbt, but I also recommend the preceding post if you want to see how they spun up their stack. - **Track historical changes:** [](https://blog.montrealanalytics.com/using-dbt-snapshots-with-dev-prod-environments-e5ed63b2c343)Snapshots are a pretty handy feature for tracking changes in dbt, but they’re often overlooked during initial onboarding. 
[Montreal Analytics explains how to set them up](https://blog.montrealanalytics.com/using-dbt-snapshots-with-dev-prod-environments-e5ed63b2c343) in dev/prod environments - **Learn dbt:** Have some new faces on the data team that might need an introduction to dbt? Our friends at GoDataDriven are hosting a [virtual dbt Learn Sept 12-14](https://www.tickettailor.com/events/dbtlabs/752537). diff --git a/website/blog/2022-09-08-konmari-your-query-migration.md b/website/blog/2022-09-08-konmari-your-query-migration.md index 62e0f613a56..f7d7cc74ead 100644 --- a/website/blog/2022-09-08-konmari-your-query-migration.md +++ b/website/blog/2022-09-08-konmari-your-query-migration.md @@ -114,7 +114,7 @@ We are ready to unpack our kitchen. Use your design as a guideline for [modulari - Important, reusable joins that are performed in the final query should be moved upstream into their own modular models, as well as any joins that are repeated in your query. - Remember that you don’t want to make these intermediate tables *too* specific. Don’t apply filters if it causes the model to be consumable by only one query downstream. If you do this, you aren’t creating a scalable project, you’re just recreating the same issue as your original query, but spread amongst mulitple models, which will be hard to untangle later. -Your final query should be concretely defined — is it a fact or dimension table? Is it a report table? What are the stepping stones to get there? What’s the most performant way to [materialize](/docs/building-a-dbt-project/building-models/materializations)? +Your final query should be concretely defined — is it a fact or dimension table? Is it a report table? What are the stepping stones to get there? What’s the most performant way to [materialize](/docs/build/materializations)? Build with the goal to scale — when might you need these intermediate models again? Will you need to repeat the same joins? 
Hopefully you’ve designed with enough intention to know the answer to that last one is “no.” Avoid repeating joins! diff --git a/website/blog/2022-09-28-analyst-to-ae.md b/website/blog/2022-09-28-analyst-to-ae.md index b89638520d9..7c8ccaeabec 100644 --- a/website/blog/2022-09-28-analyst-to-ae.md +++ b/website/blog/2022-09-28-analyst-to-ae.md @@ -38,7 +38,7 @@ Analysts are experts at taking broad statements and turning them into specific p 1. Users with a session longer than a certain amount of time 1. Users that interacted with a particular feature -From there, the analyst will build out the initial documentation and write down what they want the final dataset to look like. If your analyst is not trained as an analytics engineer, this is the point that they will need to hand the project over to a data engineer to build the [model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models). +From there, the analyst will build out the initial documentation and write down what they want the final dataset to look like. If your analyst is not trained as an analytics engineer, this is the point that they will need to hand the project over to a data engineer to build the [model](/docs/build/models). The first time that a team of analysts and data engineers builds a curated dataset, they will often expect this process to be a straight line to completion. Expectations are that the process will look something like this: @@ -133,7 +133,7 @@ It’s much easier to keep to a naming guide when the writer has a deep understa If we want to know how certain logic was built technically, then we can reference the SQL code in dbt docs. If we want to know *why* a certain logic was built into that specific model, then that’s where we’d turn to the documentation. 
-- Example of not-so-helpful documentation ([dbt docs can](https://docs.getdbt.com/docs/building-a-dbt-project/documentation) build this dynamically): +- Example of not-so-helpful documentation ([dbt docs can](https://docs.getdbt.com/docs/collaborate/documentation) build this dynamically): - `Case when Zone = 1 and Level like 'A%' then 'True' else 'False' end as GroupB` - Example of better, more descriptive documentation (add to your dbt markdown file or column descriptions): - Group B is defined as Users in Zone 1 with a Level beginning with the letter 'A'. These users are accessing our new add-on product that began in Beta in August 2022. It's recommended to filter them out of the main Active Users metric. diff --git a/website/blog/2022-10-12-how-to-design-and-structure-metrics.md b/website/blog/2022-10-12-how-to-design-and-structure-metrics.md deleted file mode 100644 index c2f58011cfe..00000000000 --- a/website/blog/2022-10-12-how-to-design-and-structure-metrics.md +++ /dev/null @@ -1,394 +0,0 @@ ---- -title: "How to design and structure dbt metrics: Recommendations for getting started" -description: "The introduction of the dbt Semantic Layer expands what users can do with dbt but introduces a familiar questions around where logic should live. Read along as the dbt Labs team talks about best practices through the lens of two different examples!" -slug: how-to-design-and-structure-metrics - -authors: [callum_mccann] - -tags: [dbt product updates] -hide_table_of_contents: false - -date: 2022-10-12 -is_featured: true ---- - ---- - -**IMPORTANT:** This document serves as the temporary location for information on how to design and structure your metrics. 
It is our intention to take this content and turn it into a Guide, like [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), but we feel that codifying information in a Guide first requires that metrics be rigorously tested by the community so that best practices can arise. This document contains our early attempts to create best practices. In other words, read these as suggestions for a new paradigm and share in the community where they do (or don’t) match your experiences! You can find more information on where to do this at the end. - ---- - -## The power of a semantic layer on top of a mature data modeling framework - -As a longtime [dbt Community](https://www.getdbt.com/community/join-the-community/) member, I knew I had to get involved when I first saw the dbt Semantic Layer in the now infamous [`dbt should know about metrics` Github Issue](https://github.com/dbt-labs/dbt-core/issues/4071). It gave me a vision of a world where metrics and business logic were unified across an entire organization; a world where the data team was no longer bound to a single consuming experience and could enable their stakeholders in dozens of different ways. To me, it felt like the opportunity to contribute to the next step of what dbt could become. - -In past roles, I’ve been referred to as the `dbt zealot` and I’ll gladly own that title! It’s not a surprise - dbt was built to serve data practitioners expand the power of our work with software engineering principles. It gave us flexibility and power to serve our organizations. But I always wondered if there were more folks who could directly benefit from interacting with dbt. - -The Semantic Layer expands the reach of dbt **by coupling dbt’s mature data modeling framework with semantic definitions.** The result is a first of its kind data experience that serves both the data practitioners writing your analytics code and stakeholders who depend on it. 
Metrics are the first step towards this vision, allowing users to version control and centrally define their key business metrics in a single repo while also serving them to the entire business. - -However, this is still a relatively new part of the dbt toolbox and you probably have a lot of questions on how exactly you can do that. This blog contains our early best practice recommendations for metrics in two key areas: -- **Design**: What logic goes into metrics and how to use calculations, filters, dimensions, etc. -- **Structure**: Where these metrics will live in your dbt project and how to compose the files that contain your metrics - -We developed these recommendations by combining the overall philosophy of dbt, with our hands-on learning gathered during the beta period and internal testing. - - - -**Pre-reading:** We recommend reading through the [metrics documentation](/docs/building-a-dbt-project/metrics), which contains a table of all the required/optional properties. - -### When to put business logic in the semantic layer vs the modeling layer - -Our instinct when designing metrics might be to encode as much information as possible into the semantic layer. An example of this is case statements - the analytics engineer’s gut instinct might be to mimic tools of the past and provide complicated case statements for the metric `expression` property to try and capture the nuance of how it should be calculated. - -But remember - you always have the option of performing this logic _in the modeling layer_. This is the key difference between dbt and other semantic layer offerings - by sitting the semantic layer atop a mature transformation layer, you always have the option to configure and optimize your logic within your models and then _define semantic components with intentionality_. 
- -Getting the balance just right is a learning experience and developing community best practices and standards will take time, which is why it’s important for us to think from first principles. What should really be our goal when determining whether logic lives in a model or a metrics? - -To explore this question and begin to develop an intuition, we’ll walk through two examples of handling this divide. - -## Basic example: Revenue - -### Designing your metrics - -In this example, we’ll cover the basics of defining a metric and a fairly straightforward example of where users can draw the line between the semantic layer and the modeling layer. You should finish this section with a better understanding of dbt metrics and its relationship to the modeling layer. - -In the past, the `marts` tables were often your end stage layer before data was consumed in another tool or system. Now, the mart is the springboard for the creation of our metric. So we'll begin by looking our end-state `marts` model called `order_events` that looks something like the below table, but on the order of millions of rows instead of five. Our finance team uses the below model to better understand revenue but inconsistencies in how it's reported have led to requests that the data team centralize the definition in the dbt repo. - -| event_date | order_id | order_country | order_status | customer_id | customer_status | amount | -| --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | United States | completed | 19 | Healthy | 10 | -| 2022-10-01 | 2 | France | completed | 36 | Churn Risk | 15 | -| 2022-10-02 | 2 | France | returned | 36 | Churned | 15 | -| 2022-10-02 | 3 | Turkey | completed | 20 | Healthy | 80 | -| 2022-10-03 | 4 | Korea | completed | 14 | Churn Risk | 24 | - -### Logic in the modeling layer vs the semantic layer - -We know from our upstream dbt models that the `amount` field represents the revenue from from each order. 
The inconsistent reporting, however, has arisen because the correct definition of revenue only refers to orders that are completed, not returned. Some teams aren’t familiar with this additional filter and it has led to company wide misreporting. - -The solution is to use the flexibility of the dbt modeling layer to add a boolean field called `is_active_row` that shows whether or not the row in question is the most recent version. With this, we can understand and filter out duplicate rows that may be connected to the same order. - -Once we have this field, we reach a diverging path: - -- If we are not interested in seeing the history of `order_events` , we can add a `where` clause **to the model itself**. This would ensure there is only one row per order. -- If we **are** interested in seeing the history of `order_events` , we can add a `filter` to the metric definition to ensure that these duplicate order rows don’t cause us to misreport revenue - -Both of these paths ensure that only the correct orders are included in the metric calculation but one does it at the modeling layer and the other the semantic layer. There is no **best** path here - it depends on your organization's reporting and analytics needs. For this example, we’ll say that our business isn’t interested in understanding orders that have gone from completed to returned and so we’ll use option one moving forward. 
Now lets define the metric: - -```yaml -version: 2 -metrics: - - name: total_revenue - label: The revenue of our business - model: ref('order_events') - description: "The revenue for our business, as defined by Jerry in Finance" - - calculation_method: sum - expression: amount - - timestamp: event_date - time_grains: [day, week, month, all_time] - - dimensions: - - customer_status - - order_country - - ## We don't need this section because we chose option 1 - ## filters: - ## - field: order_status - ## operator: '=' - ## value: 'completed -``` - -Each of the properties of the above definition are defined [in the metrics documentation](https://docs.getdbt.com/docs/building-a-dbt-project/metrics), but let’s dig into the two that might require some additional explanation. The two in question are `expression` and `dimensions`. - -In plain english, the `expression` property is the sql column (or expression) that we are applying the calculation method on. In our example above, this simplifies to `sum(amount)`. However, this doesn’t **need** to be a field in the model. It could also be a sql expression like `case when condition = true then field else 0 end` . - -And then there’s `dimensions`. - -### Choosing which dimensions to use with your metric - -The `dimensions` attribute is a bit more nuanced than the others because it involves curating the ways through which a user can interact with the metric. To that end … - -❗ **We recommend curating dimensions, not including all columns within the model. Most models contain dimensions that aren’t relevant for end-user analysis.** - -What do we mean? Well, there is a lot of nuance in what constitutes a useful or less useful dimension that is dependent on the shape of the underlying data and the ways with which the metric will be consumed. 
Continuing with our revenue use case, here are some examples: - -- **Useful Dimensions:** - - `customer_status`: This field is helpful to end users because it allows them to break down the revenue generated by each customer status grouping. Members of the retention team might be interested in understanding the long-term trends of revenue from the Churn Risk group so that they can better understand the impact that their retention initiatives campaigns have had. - - `order_country`: This field is useful because it allows members of the finance team to break down the accepted revenue from each country of origin so that they can better understand which countries are experiencing the highest growth. -- **Less Useful Dimensions:** - - `order_status` : Given that order_status is a part of the metric definition, it doesn’t make sense to include in the acceptable dimensions list because the value returned would always be `completed`. - - `order_id`: Each order id corresponds to a single order and a single point in time. Grouping the metric of revenue by order_id would just return the base grain of the table and the same value as the amount field - not useful from a metric perspective! -- **Nuanced Dimensions:** - - `customer_id`: This is an interesting field because it can be both good and bad depending on the context in which it is used and the underlying data. In our example use case, this dimension wouldn’t be that useful - it would contain too many unique values and tracking the individual revenue impact by a single customer doesn’t make sense on a retail scale. - - In a SaaS business though, it might make more sense - especially with usage based pricing. The Customer Success team might be interested in tracking the revenue of certain customers and ensuring that they remain consistent. 
- -To quote Cameron Afzal, Product Manager of the dbt Semantic Layer: - -> Thoughtful curation of dimensions is essential for three main reasons: -- **Relevance:** Analysts must include the dimensions most relevant to answering the question. -- **Trust**: Curating high-quality dimensions with little to no known errors helps ensure trust in analysis results and the decisions that follow. -- **Efficiency**: Curation provides a faster path to high-quality analysis results. -> - -To put it another way, **metrics are most useful when every dimension provided can help provide answers to the business.** - -## Advanced example: NPS - -### Designing a complex metric - -Now let’s look at a more complex example of a metric - one that is built from components that could theoretically themselves be metrics. The metric in question is Net Promoter score, which is used by the dbt Labs internal analytics team to understand the experience that users are having on dbt Cloud. - -For those of you who are unfamiliar with the industry metric of Net Promoter Score, here is a [great article from the folks over at Delighted on how it is calculated.](https://delighted.com/net-promoter-score) The short version of it is `the percentage of promoters - the percentage of detractors`. - ---- - -Here at dbt Labs we provide users with short surveys where they can provide feedback (as well as in a few other locations). The data is collected from those surveys is used to calculate our NPS Score, which helps us understand user sentiment over time. 
- -Given that these surveys come from a few different sources, there is a large amount of upstream modeling performed in order to unify them in a single model, but the end result is something that looks like the table below: - -| feedback_date | unique_id | feedback_source | user_type | account_plan | score | nps_category | -| --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | nps_tool_1 | developer | team | 5 | detractor | -| 2022-10-01 | 2 | nps_tool_2 | read_only | developer | 8 | promoter | -| 2022-10-02 | 3 | nps_tool_1 | developer | enterprise | 10 | promoter | -| 2022-10-02 | 4 | nps_tool_1 | developer | developer | 7 | passive | -| 2022-10-02 | 5 | nps_tool_2 | developer | team | 9 | promoter | -| 2022-10-03 | 6 | nps_tool_1 | developer | enterprise | 7 | passive | - -The dbt Internal Analytics team ([long may they reign](https://www.linkedin.com/feed/update/urn:li:activity:6962884130569080833/)) took this data and decided to build the NPS Score metric into our repo so that it could be surfaced to stakeholders in multiple tools. This process is where we began to form our opinions on what should live in the modeling layer vs semantic layer - but these are sure to progress as we add in more and more real world use cases. - -### Option 1: Putting everything in the semantic layer - -If we wanted to store all the logic inside metric definitions, we could use the following code in the Semantic Layer section to create 6 different metrics that result in the NPS Score metric. This would allow end users to retrieve the NPS Score they are interested in a version-controlled, standard way across any of their BI tools of choice. Additionally, it allows users to individually slice/dice any of the component metrics by themselves. - -```yaml -metrics: - - name: total_respondents - label: Total of NPS Respondents - model: ref('customer_nps') - description: 'The count of users responding to NPS surveys in dbt Cloud.' 
- calculation_method: count - expression: unique_id - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - - - name: total_promoter_respondents - ......... ##same as total_respondents - filters: - - field: nps_category - operator: '=' - value: "'promoter'" - - - name: total_detractor_respondents - ......... ##same as total_respondents - filters: - - field: nps_category - operator: '=' - value: "'detractor'" - - - name: promoters_pct - label: Percent Promoters (Cloud) - description: 'The percent of dbt Cloud users in the promoters segment.' - calculation_method: expression - expression: "{{metric('total_promoter_respondents')}} / {{metric('total_respondents')}}" - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - - - name: detractor_pct - ... ##same as promoters_pct - expression: "{{metric('total_detractor_respondents')}} / {{metric('total_respondents')}}" - - - name: nps_score - label: Net Promoter Score - description: 'The NPS (-1 to 1) of all dbt Cloud users.' - calculation_method: expression - expression: "{{metric('promoters_pct')}} - {{metric('detractors_pct')}}" - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type - -``` - -### Option 2: Keeping logic in the modeling layer - -But what if we didn’t want to encode all that information in the metric definitions? If we didn’t need the ability to dig into the component metrics and only wanted to look at the final score? In that case, we could encode most of the logic into the model itself and define the metric on top of that! - -Thinking through this, we know that our NPS Score is a series of ratios dependent on conditions of which category people fall into with the end result being a number between 100 to -100. 
That number is usually then *displayed* in a percentage format but it is *calculated* as a number. - -So in order to reduce the complexity of metric code, we can add a new field into the model that assigns an `nps_value` to each survey received. The logic for this field would assign a value of 100, 0, or -100 depending on the survey’s `nps_category`. Example code below: - -```sql -case - when nps_category = 'detractor' then -100 - when nps_category = 'promoter' then 100 - else 0 -end as nps_value -``` - -The end result of adding this code to the model would look something like this: - -| feedback_date | unique_id | feedback_source | user_type | account_plan | score | nps_category | nps_value | -| --- | --- | --- | --- | --- | --- | --- | --- | -| 2022-10-01 | 1 | nps_tool_1 | developer | team | 5 | detractor | -100 | -| 2022-10-01 | 2 | nps_tool_2 | read_only | developer | 9 | promoter | 100 | -| 2022-10-02 | 3 | nps_tool_1 | developer | enterprise | 10 | promoter | 100 | -| 2022-10-02 | 4 | nps_tool_1 | developer | developer | 7 | passive | 0 | -| 2022-10-02 | 5 | nps_tool_2 | developer | team | 9 | promoter | 100 | -| 2022-10-03 | 6 | nps_tool_1 | developer | enterprise | 7 | passive | 0 | - -Now that each survey has an associated `nps_value` we can forgo the ratio calculations used in the Metric Logic section and create our NPS Score metric as a single average metric. - -```yaml -metrics: - - name: nps_score - label: NPS Score - model: ref('customer_nps') - calculation_method: average - expression: nps_value - timestamp: created_at - time_grains: [day, month, quarter, year] - dimensions: - - feedback_source - - account_plan - - user_type -``` - -
- Why does this work? - -This is a slightly different way of calculating NPS from the usually provided formula but it ends up with the same result. Here is why: - -- `promoter_pct` was defined as `total_promoter_respondents` / `total_respondents` - - In our example dataset, this nets out to 3 / 6 = 50%. - - If we instead assign a value of 100 and take the average, it becomes 300 / 6 = 50. -- `detractor_pct` was defined as `total_detractor_respondents` / `total_respondents` - - In our example dataset, this nets out to 1 / 6 = 16.67%. - - If we instead assign a value of 100 and take the average, it becomes -100 / 6 = -16.67. -- Therefore, our `nps_score` follows suit: - - In our example dataset, 50% - 16.67% = 33.33% - - If we instead assign a value of 100 and take the average, it becomes 200 / 6 = 33.33 - -The underlying principle of why this works is based on the fact that averages divide the sum of the values in the set by their number. In more dbt friendly terms, what it really means is that average is creating the following equation: `sum(value)/count(*)`. In the first example implementation, we were doing roughly the same thing with multiple metric definitions - the only difference was our numerator was a count that assigned each row a value of 1. So if we duplicate that logic and give each row a value of 1 then we can create far fewer metrics. - -But that only gets us to the `promoter_pct` and `detractor_pct` metrics. In order to combine these both into a single metric definition, we needed to change the value that we assign. Given that the total range of values that the metric could output is -100 (all detractors) to 100 (all promoters) we can assign each of those categories that peak value, along with 0 for passives. This means that when the numerator is aggregated, it nets out promoters against detractors just like the documented equation does `promoter score - detractor score` . - -
- -**Is this what I should do?** - -[It depends!](https://twitter.com/SBinLondon/status/1413113782214266885) There will be times when it might be better to have logic stored in the modeling layer and there will be times when it might be better to have logic stored in the semantic layer. Our shorthand is to only include logic in the semantic layer if it is needed by our stakeholders - if they don't need to analyze the components, we keep them in the modeling layer. In the end, the needs of your business stakeholders should drive your decision on where to keep this logic. - -## How to structure your metrics - -Now that we’ve designed our metrics, let's move on to structuring them within our project. We'll examine the different ways to organize metrics and take a look at the pros and cons of several strategies. - -### Folder structure - -If you follow [dbt’s best practices for structuring your project](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), you will have a folder structure that looks similar to this: - -```yaml -models: - staging: - intermediate: - marts: -``` - -Your marts folder would most likely contain your end-state models ready for business consumption. Given that metrics are meant for business consumption, we are presented with two options - staying within the same framework or representing metrics as their own level. - -We recommend Option A (metrics within marts) but recognize that some people might prefer Option B (metrics within models). - -**A. Metrics within marts** - -Create a metrics folder within marts and use this to contain all of your metric definitions. - -```yaml -models: - staging: - intermediate: - marts: - metrics: -``` - -**B. Metrics within models** - -Create a metrics folder within models and use this to contain all of your metric definitions. 
- -```yaml -models: - staging: - intermediate: - marts: - metrics: -``` - -### File structure - -Once you’ve decided ***where*** to put your metrics folder, you can now decide ***how*** you want to structure your metrics within this folder. Choose one of two methods for structuring metrics: - -**Option A: The all-in-one YML method** -This method follows a similar pattern to [dbt’s best practices around model structure](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). The introduction of the metrics folder is the only change from the standard best practice. - -In practice, the all-in-one YML method would look like the following: - -```yaml -## Metrics within Marts -models: - marts: - metrics: - - metrics.yml ------- -## Metrics within Models -models: - metrics: - - metrics.yml -``` - -**Option B: The single-metric-per-file method** -In this method, you create *one* yml file for *each* metric*.* Although this is an opinionated stance that differs from [dbt’s best practices](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), here are some reasons why this **could** be useful: - -- Individual files are more easily discovered by new analytics engineers as your organization expands -- Individual files can more easily define specific code owners that may not be part of the data team. - -For example, Jerry from the Finance department is the code owner for the `revenue` metric definition and oversees it for the business. So, any change to this specific file would need Jerry’s sign-off. - -This can be tricky for code owners who aren’t familiar with your git flow, but it brings them into the chain of responsibility for the metric definition. It also helps them take ownership for reporting on this metric and creates a responsible party when definitions need to change. 
- -The single-file-code-owner method would look like this: - -```yaml -models: - metrics: - marts: - - revenue.yml - - average_order_value.yml - - some_other_metric_name.yml -``` - -### Folder and file structure is a preference, not a hard rule - -In the end, all of the structuring information above is just a recommendation. Your project probably has a defined convention in how nodes are organized, whether or not it follows dbt best practices, and you should continue to follow your own organizational practices. That said, we do recommend that metrics should be separate from model yml files. The reason? - -**Metrics are important business objects unto themselves and should live separate from the model definitions.** - -## A call to action - -This is just the beginning of dbt metrics and the Semantic Layer. We have a number of exciting ideas for expanding capabilities that we plan to begin work on in the coming months. However, we can’t do that without you. - -This semantic layer is a fundamental change to what it means to interact with dbt and ultimately most of the best practices will come from the dbt Community - folks like you. It does not matter if you consider yourself an "expert" on this - we want to talk to you and hear how you are using or would like to use metrics and the semantic layer. Y’all are going to be our guiding light to help us make sure that all the functionality we add helps **you** serve the needs of your business. - -If your experience with the Semantic Layer match what we’ve written in this post, and especially if they don’t, please share [comments and feedback in this Discourse Discussion](https://discourse.getdbt.com/t/how-to-design-and-structure-metrics/5040)! - -Additionally, I would invite you to join us over at #dbt-core-metrics on the dbt Slack where we’ll be posting updates, answering questions, discussing usage, and hopefully responding with the best emojis. 
diff --git a/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md b/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md index 5a18890e906..bab92000a16 100644 --- a/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md +++ b/website/blog/2022-10-19-polyglot-dbt-python-dataframes-and-sql.md @@ -4,8 +4,7 @@ description: "Going polyglot is a major next step in the journey of dbt Core. It slug: polyglot-dbt-python-dataframes-sql authors: [doug_beatty] - -tags: [dbt product updates] +tags: [dbt tutorials] hide_table_of_contents: false date: 2022-10-18 @@ -59,7 +58,7 @@ Gaining **your own** sense of these differences will empower you to create your ## Comparing dataframe libraries -Before we get into our hands-on example, let’s take a look at the nuts and bolts of getting your project working with different dataframe types. Multiple data platforms and dataframe libraries are supported in dbt Core as of version 1.3, but not uniformly (see compatibility table below). See [here](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/python-models#specific-data-platforms) for platform-specific setup instructions. +Before we get into our hands-on example, let’s take a look at the nuts and bolts of getting your project working with different dataframe types. Multiple data platforms and dataframe libraries are supported in dbt Core as of version 1.3, but not uniformly (see compatibility table below). See [here](/docs/build/python-models) for platform-specific setup instructions. | **Type of data frame** | **Snowflake** | **Databricks** | **BigQuery** | |----------------------------|------------------------------------|-----------------------------------|---------------| @@ -72,7 +71,7 @@ Before we get into our hands-on example, let’s take a look at the nuts and bol
More detailed comparisons and trade-offs -Snowpark DataFrames are only supported in Snowflake, while Spark DataFrames are only available on Databricks and BigQuery. It’s also worth keeping in mind that different types of dataframes use [different syntax](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/python-models#dataframe-api-and-syntax). +Snowpark DataFrames are only supported in Snowflake, while Spark DataFrames are only available on Databricks and BigQuery. It’s also worth keeping in mind that different types of dataframes use [different syntax](/docs/build/python-models#dataframe-api-and-syntax). We’ve intentionally not included Jinja within Python models: a primary use of Jinja is for control flow and accessing dynamic-esque variables both of which you can just do within Python! The other main use for Jinja within dbt is for creating abstractions across differing database syntaxes. At this time, there’s no unified syntax abstraction across the different types of dataframes. (But someone will probably [make one](https://xkcd.com/927/)!) diff --git a/website/blog/2022-10-24-demystifying-event-streams.md b/website/blog/2022-10-24-demystifying-event-streams.md index 39829c3bca0..3fe689742a6 100644 --- a/website/blog/2022-10-24-demystifying-event-streams.md +++ b/website/blog/2022-10-24-demystifying-event-streams.md @@ -28,7 +28,7 @@ Under the hood, the Merit platform consists of a series of microservices. Each o ![](/img/blog/2022-10-24-demystifying-event-streams/merit-platform.png) -In the past we relied upon an ETL tool (Stitch) to pull data out of microservice databases and into Snowflake. This data would become the main dbt sources used by our report models in BI. +In the past we relied upon an ETL tool (Stitch) to pull data out of microservice databases and into Snowflake. This data would become the main dbt sources used by our report models in BI.
![](/img/blog/2022-10-24-demystifying-event-streams/merit-platform-stitch.png) @@ -84,7 +84,7 @@ However, working with these blobs is much less convenient tha 1. One challenge is partial updates - we disallow those currently so that we never need to recreate the state of a domain model across multiple json blobs. 1. Distributed systems folks will identify another problem: relying on timing. Due to clock skew, we can’t guarantee that event A’s timestamp being earlier than another B’s means that A occurred before B. If both messages are sent on the same Kafka topic then Kafka can ensure ordering (if configured properly), but we don’t want to limit all events to using the same topic. So we choose to ignore this problem since we have relatively low traffic and low machine volume compared to the Googles and Facebooks of the world. We can also verify the likelihood of clock skew affecting our data by looking for events with the same identifying ID happening within the same second - it doesn’t happen often for us. -Instead of repeatedly working with the above challenges, we decided to create a relational layer on top of the raw event streams. This takes the form of [dbt macros](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros) that handle all of the above problems. +Instead of repeatedly working with the above challenges, we decided to create a relational layer on top of the raw event streams. This takes the form of [dbt macros](/docs/build/jinja-macros) that handle all of the above problems. In order to make the dbt macros easier to write, we requested that engineering add some metadata to all of their events. This formalized the contract between engineering and data - any domain models that don’t comply with the contract will not be able to be used in reports unless the engineering team themself builds a custom pipeline. We named this the Obvious Model Generation (OMG) Contract since providing the metadata leads to obvious domain model generation. 
And we liked the acronym. @@ -266,7 +266,7 @@ models: We learned a lot from both working with event streams and building these macros. -One consideration that we haven’t discussed yet is [materialization](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/materializations) strategy. Since event stream tables are append-only, this is a natural fit for incremental models. At Merit, we haven’t worked much with incremental models, so we’re opting to start with views. As we roll this out to production models we’ll be doing a ton of performance testing to figure out the perfect materialization strategy for us. +One consideration that we haven’t discussed yet is [materialization](https://docs.getdbt.com/docs/build/materializations) strategy. Since event stream tables are append-only, this is a natural fit for incremental models. At Merit, we haven’t worked much with incremental models, so we’re opting to start with views. As we roll this out to production models we’ll be doing a ton of performance testing to figure out the perfect materialization strategy for us. We also plan on adding a dbt test that alerts whenever the columns of any domain model table changes. This may indicate that an unexpected change has happened to an event schema, which could affect dashboards. diff --git a/website/blog/2022-11-22-move-spreadsheets-to-your-dwh.md b/website/blog/2022-11-22-move-spreadsheets-to-your-dwh.md index 67f217c76a4..ba5dddcae19 100644 --- a/website/blog/2022-11-22-move-spreadsheets-to-your-dwh.md +++ b/website/blog/2022-11-22-move-spreadsheets-to-your-dwh.md @@ -33,9 +33,9 @@ Let’s have a look at some of the offerings to help you get your spreadsheets i ## dbt seeds -dbt comes with an inbuilt csv loader ([seeds](https://docs.getdbt.com/docs/building-a-dbt-project/seeds)) to populate your data warehouse with any files you put inside of your project’s `seeds` folder. 
It will automatically infer data types from your file’s contents, but you can always override it by [providing explicit instructions in your dbt_project.yml](https://docs.getdbt.com/reference/resource-configs/column_types) file. +dbt comes with an inbuilt csv loader ([seeds](https://docs.getdbt.com/docs/build/seeds)) to populate your data warehouse with any files you put inside of your project’s `seeds` folder. It will automatically infer data types from your file’s contents, but you can always override it by [providing explicit instructions in your dbt_project.yml](https://docs.getdbt.com/reference/resource-configs/column_types) file. -However, since dbt creates these tables by inserting rows one at a time, it doesn’t perform well at scale (there’s no hard limit but aim for hundreds of rows rather than thousands). [The dbt docs](https://docs.getdbt.com/docs/building-a-dbt-project/seeds#faqs) suggest using seeds for “files that contain business-specific logic, for example, a list of country codes or user IDs of employees.” +However, since dbt creates these tables by inserting rows one at a time, it doesn’t perform well at scale (there’s no hard limit but aim for hundreds of rows rather than thousands). [The dbt docs](https://docs.getdbt.com/docs/build/seeds#faqs) suggest using seeds for “files that contain business-specific logic, for example, a list of country codes or user IDs of employees.” A big benefit of using seeds is that your file will be checked into source control, allowing you to easily see when the file was updated and retrieve deleted data if necessary. @@ -53,7 +53,7 @@ A big benefit of using seeds is that your file will be checked into source contr ## ETL tools -An obvious choice if you have data to load into your warehouse would be your existing [ETL tool](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/) such as Fivetran or Stitch, which I'll dive into in this section. 
Below is a summary table highlighting the core benefits and drawbacks of certain ETL tooling options for getting spreadsheet data in your data warehouse. +An obvious choice if you have data to load into your warehouse would be your existing [ETL tool](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/) such as Fivetran or Stitch, which I'll dive into in this section. Below is a summary table highlighting the core benefits and drawbacks of certain ETL tooling options for getting spreadsheet data in your data warehouse. ### Summary table @@ -123,7 +123,7 @@ I’m a big fan of [Fivetran’s Google Drive connector](https://fivetran.com/do Like the Google Sheets connector, the data types of the columns are determined automatically. Dates, in particular, are finicky though—if you can control your input data, try to get it into [ISO 8601 format](https://xkcd.com/1179/) to minimize the amount of cleanup you have to do on the other side. -I used two macros in the dbt_utils package ([get_relations_by_pattern](https://github.com/dbt-labs/dbt-utils#get_relations_by_pattern-source) and [union_relations](https://github.com/dbt-labs/dbt-utils#union_relations-source)) to combine weekly exports from other tools into a single [model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models) for easy cleanup in a staging model. Make sure you grant your transformer account permission to access all tables in the schema (including future ones) to avoid having to manually intervene after every new file is uploaded. +I used two macros in the dbt_utils package ([get_relations_by_pattern](https://github.com/dbt-labs/dbt-utils#get_relations_by_pattern-source) and [union_relations](https://github.com/dbt-labs/dbt-utils#union_relations-source)) to combine weekly exports from other tools into a single [model](/docs/build/models) for easy cleanup in a staging model.
Make sure you grant your transformer account permission to access all tables in the schema (including future ones) to avoid having to manually intervene after every new file is uploaded. #### Good fit for: diff --git a/website/blog/2022-11-30-dbt-project-evaluator.md b/website/blog/2022-11-30-dbt-project-evaluator.md index 0ab3c5d2b31..558d8877d72 100644 --- a/website/blog/2022-11-30-dbt-project-evaluator.md +++ b/website/blog/2022-11-30-dbt-project-evaluator.md @@ -120,4 +120,4 @@ If something isn’t working quite right or you have ideas for future functional Together, we can ensure that dbt projects across the galaxy are set up for success as they grow to infinity and beyond. - \ No newline at end of file + diff --git a/website/blog/2022-12-14-sams-foundry-experience.md b/website/blog/2022-12-14-sams-foundry-experience.md new file mode 100644 index 00000000000..c702d01c942 --- /dev/null +++ b/website/blog/2022-12-14-sams-foundry-experience.md @@ -0,0 +1,59 @@ +--- +title: "Making the leap from accountant to analytics engineer" +description: "The Foundry Program is an apprenticeship at dbt Labs designed to turn data newbies into fully-fledged analytics engineers over the course of six months. As one of the inaugural foundry apprentices, Sam shares his journey into the practice of analytics engineering." +slug: sam-foundry-experience + +authors: [sam_harting] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2022-12-15 +is_featured: true +--- + +In seventh grade, I decided it was time to pick a realistic career to work toward, and since I had an accountant in my life who I really looked up to, that is what I chose. Around ten years later, I finished my accounting degree with a minor in business information systems (a fancy way of saying I coded in C# for four or five classes). I passed my CPA exams quickly and became a CPA as soon as I hit the two-year experience requirement. 
I spent my first few years at a small firm completing tax returns but I didn't feel like I was learning enough, so I went to a larger firm right before the pandemic started. The factors that brought me to the point of changing industries are numerous, but I’ll try to keep it concise: the tax industry relies on underpaying its workers to maintain margins and prevent itself from being top-heavy, my future work as a manager was unappealing to me, and **my work was headed in a direction I wasn’t excited about**. + + + +I took on an initiative within the firm to use Alteryx to speed up some of our more labor-intensive but relatively consistent calculations. In the process of learning the software, I realized I **really** liked working with data and making processes more efficient. I mentioned this to my brother, David, and he said something to the effect of “the company I work for makes software that does that, and they are opening up an apprenticeship. You should apply.” David got me connected with a few people from the industry (including [Sung, a Solutions Architect](https://docs.getdbt.com/author/sung_chung) at dbt Labs), and I had the opportunity to pick their brains about what it is they did. After becoming increasingly interested, I started taking a [SQL class on Udemy](https://www.udemy.com/course/the-complete-sql-bootcamp/) and applied for the apprenticeship. + +What ultimately motivated me to apply for the Foundry Program was the logical exploration involved in [data transformation](https://www.getdbt.com/analytics-engineering/transformation/). Rather than working backward from a predetermined solution (e.g. tax work), I was working forward from source data toward something I could create! Not to mention, everything I had heard about [the dbt Community](https://www.getdbt.com/community/join-the-community/), both internally and externally, seemed to be significantly more rewarding and satisfying than my at-the-time trajectory. 
+ +## The Foundry journey + +> The [Foundry Program](https://www.getdbt.com/blog/announcing-the-foundry-program/) is an apprenticeship designed to turn data newbies into fully-fledged analytics engineers over the course of six months. As one of the inaugural Foundry apprentices, I’m here to share my journey into analytics engineering, along with the takeaways I picked up along the way. We’re continuing to improve the program with each iteration, but the curriculum for my cohort was split into two parts—three months of training followed by three months of hands-on work. + +### Getting started + +As I got started with the apprenticeship, did my expectations play out into reality? The answer to that is yes and no, but maybe not for reasons you expect. + +I was able to enjoy coding like I thought I would. I was definitely challenged logically, and I got to build things without a clear right answer in mind. In addition, the company and culture were better than I had anticipated. This manifested itself quickly when listening to the company talk about profits; of course, profits are important to dbt Labs, but they aren't the **only** [way we measure our success](https://github.com/dbt-labs/corp/blob/main/values.md). Spend five minutes in a CPA firm and you will find out how untrue that is for that industry. + +On the flip side, the most surprising part of the job was what I would actually be doing. Truthfully, I never really understood what “[analytics engineering](https://www.getdbt.com/what-is-analytics-engineering/)” meant throughout the interview process, so I was going in a bit blind. I knew SQL was involved, and I had read many dbt articles talking about Analytics Engineering, but none of that made a ton of sense to me without actually putting it into practice. I will always remember being so confused when I installed dbt for the first time and there was no icon on my computer. *Download complete!* Okay…. now… how do I open this? 
In a panic, I called my brother and asked where in the blazes the app was on my computer! + +I was also surprised by the number of tools and languages I had to learn; it was difficult and intimidating to approach the command line, git, YAML, and SQL all at the same time. At times, it caused imposter syndrome: the idea that I shouldn’t be here and somehow tricked my way into getting this apprenticeship. + +There would be these moments, however, when I would do something I couldn’t have done two months ago, or I held a conversation that would have sounded like a foreign language when I was working in tax. These moments helped put my imposter syndrome in the back seat. Not to mention, my managers and mentors were so supportive and great at giving positive feedback and constructive criticism. My manager, Jess, would constantly say “Can you believe in x number of months you are talking about this intelligently?” as a reminder that I was indeed learning *a lot*. + +The last stand-out bit about getting started was [Coalesce](https://coalesce.getdbt.com/). Never have I ever experienced a virtual event with such a strong sense of community. Of course, the speakers were great and I learned a lot throughout the week; there is no doubt about that. + +The thing that stood out to me the most was just *how excited* dbt users were about the product and community. Honestly, at times I forgot I was supposed to be listening because I was having such a great time just interacting with community members in Slack. **I think for someone new to dbt interacting with the community is *just* as important as learning the software**—the Community is the backbone of dbt; people had been saying it since I had started the apprenticeship, and in that week I got to see it come to life! + +### Client work and teaching + +By the time I was assigned to client projects, I was itching to learn by **doing**. The nerves and imposter syndrome were very real at first. 
Those feelings quickly dissipated as I got coaching from my development partners and was able to assist our clients. I started with the most wonderful clients who were grateful and praised our work. It was really reassuring that not only could I do this, but that it was something I wanted to do. Again, I got to tackle real-life problems and find the solutions myself. I got to learn what a development cycle looks like and how it might vary from customer to customer. I began learning certain BI tool intricacies like LookML, which was **incredibly** challenging at first, but eventually I started getting the hang of it. I went from learning, to learning *and* having opinions! “Wouldn’t it be great if this tool did this…” is a mark of someone that has moved beyond simply absorbing knowledge and has started to challenge it. + +While my work in the CPA world was a form of consulting, it never broached training. Teaching others how to use dbt Cloud was way out of my comfort zone at first, but I found it to be one of the best ways to learn the product. We got the opportunity to teach the fundamentals of dbt through [Group Training and Rapid Onboardings](https://www.getdbt.com/dbt-labs/services/). Our Director of Training, Kyle, walked us through what we would be teaching and some best practices, and gave us the opportunity to practice with real-time feedback. That feedback gave me the confidence boost I needed before my first attempt at training a real client. It took a few times of actually doing it for it to feel natural and tailor my style to something that worked, but with the help of my team members, I eventually felt pretty confident teaching! + +### Post-program + +Hey, I got hired! Woohoo! My experience since then has been genuinely wonderful. Of course, there have been times of difficulty or stress, but that is literally every job. 
I’ve had the opportunity to continue to work with different colleagues (spoiler alert, literally [every](https://docs.getdbt.com/author/lauren_benezra) [one](https://docs.getdbt.com/author/dave_connors) [of](https://docs.getdbt.com/author/wasila_quader) [them](https://docs.getdbt.com/author/grace_goheen) is someone I would love to grab a beer or coffee with). + +Most of the work I have done so far has been training others on dbt Cloud, and the work has had a lot more variety than I thought it would! Every project that I help while training is unique, every data team is distinctive, and every single client has taught me something awesome about dbt Cloud by asking good questions. + +As a full-time team member, my input is highly valued, even though I am new at analytics engineering. Even before I came on full-time, I offered my input and people would take it seriously. At times, they would even ask for it. But now that I am a full member of the team, I am actually getting assigned tasks that require a lot of personal input and managing the input of others. It feels so good to be immediately valued as a team member. I have never felt that I was a burden, or getting in the way of the Professional Services team. Rather, it has always felt like they are ready and excited to work with me. It is a small difference with a giant impact. + +Going forward I will be moving away from training (although not entirely!) and moving towards more hands-on consulting work. Our team continues to expand with more and more wonderful people. I have several goals going forward: getting Snowflake certified, getting [dbt certified](https://www.getdbt.com/dbt-certification/), and learning Python. Every single one of these goals is being encouraged by my managers and peers. 
I truly feel supported in this work and that as long as I stick with [these people](https://www.getdbt.com/blog/we-the-purple-people/), put in the effort, and keep an open mind, I will find my career in analytics engineering deeply fulfilling! + +The best part is: I’m not alone. I was lucky to have [Wasila Quader](https://docs.getdbt.com/author/wasila_quader) as a fellow Foundry Program apprentice. She was a constant source of support, knowledge, and camaraderie. Wasila also wrote about her experience which you can [read about here](https://docs.getdbt.com/blog/wasila-foundry-experience). diff --git a/website/blog/2023-01-17-grouping-data-tests.md b/website/blog/2023-01-17-grouping-data-tests.md new file mode 100644 index 00000000000..23fcce6d27e --- /dev/null +++ b/website/blog/2023-01-17-grouping-data-tests.md @@ -0,0 +1,93 @@ +--- +title: "Power up your data quality with grouped checks" +description: "Which of these numbers doesn't belong? [-1, 0, 1, null]. You can't judge data quality without data context, so our tools should enable as much context as possible." +slug: grouping-data-tests + +authors: [emily_riederer] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2023-01-17 +is_featured: true +--- +Imagine you were responsible for monitoring the safety of a subway system. Where would you begin? Most likely, you'd start by thinking about the key risks like collision or derailment, contemplate what causal factors like scheduling software and track conditions might contribute to bad outcomes, and institute processes and metrics to detect if those situations arose. What you wouldn't do is blindly apply irrelevant industry standards like testing for problems with the landing gear (great for planes, irrelevant for trains) or obsessively worry about low probability events like accidental teleportation before you'd locked down the fundamentals.  
+ +When thinking about real-world scenarios, we're naturally inclined to think about key risks and mechanistic causes. However, in the more abstract world of data, many of our data tests often gravitate towards one of two extremes: applying rote out-of-the-box tests (nulls, PK-FK relationships, etc.) from the world of traditional database management or playing with exciting new toys that promise to catch our wildest errors with anomaly detection and artificial intelligence.  + +Between these two extremes lies a gap where human intelligence goes. Analytics engineers can create more effective tests by embedding their understanding of how the data was created, and especially how this data can go awry (a topic I've [written about previously](https://emilyriederer.com/post/data-error-gen/)). While such expressive tests will be unique to our domain, modest tweaks to our mindset can help us implement them with our standard tools. This post demonstrates how the simple act of conducting tests _by group_ can expand the universe of possible tests, boost the sensitivity of the existing suite, and help keep our data "on track". This feature is [now available in dbt-utils](https://github.com/dbt-labs/dbt-utils#grouping-in-tests).  + + + +## Grouped checks + +Group-based checks can be important for fully articulating good "business rules" against which to assess data quality. For example, groups could reflect either computationally-relevant dimensions of the process (e.g. data loaded from different sources) or semantically-relevant dimensions of the real-world process that our data captures (e.g. repeated measures pertaining to many individual customers, patients, product lines, etc.) Such checks can make existing tests more rigorous while others are only expressible at the grouped level. + +### Only expressible +Some types of checks can only be expressed by group. 
For example, in a dataset containing train schedules across a transit system, an `ARRIVAL_TIME` field might not be unique; however, it would (hopefully) always be unique for a specific `TRACK` and `STATION`! + +### More rigorous +Consider a recency check (i.e. that the maximum date represented in the data is appropriately close to the present); if the data loads from multiple sources (e.g. ticket purchases through the web, a mobile app, or a station kiosk), a check of the maximum date could pass the check if any one source loaded, but unless the data is grouped by source and _each_ group's maximum date is checked, stale data could go undetected. + +## Case study: NYC subway data + +To demonstrate the utility (or, should I say, necessity) of group-level checks, let's consider some real-world open data from the [NYC subway system](http://web.mta.info/developers/turnstile.html) which I can always count on to have plenty of data quality quirks (which, to be clear, I do not say as a criticism; there's nothing unexpected about this in real-world "data as residue" data). Cumulative entries through each turnstile across all subway stations are recorded 4x daily, creating a structure with one record for each turnstile and timestamp combination. + +Of course, the information we want out of this data is probably not the cumulative count through some turnstile from some arbitrary start date but rather the station-level entries during a given period. So, in our transformations, we would take a lagged difference of the cumulative entries by turnstile and aggregate that up to the station-level. Just collating data from 5,000 sensors – what could go wrong, right? + +However, that seemingly trivial lagged-difference transformation makes two key assumptions: the cumulative entries are monotonically increasing _by turnstile_ and an observation for every time period is present _for every turnstile_. 
+ +These conditions illustrate the two benefits of grouped checks we mentioned before: monotonicity can only be assessed after grouping by turnstile (there's no reason the cumulative entry count should only go up when comparing observations across different turnstiles), and although the presence of given timestamps _could_ be checked at the dataset level, it is substantially more rigorous when checked at the individual sensor level.  + +So what do we discover when we validate our data by group? + +Testing for monotonicity, we find many poorly behaved turnstiles. Unlike the well-behaved dark blue line, other turnstiles seem to _decrement_ versus _increment_ with each rotation while still others cyclically increase and plummet to zero – perhaps due to maintenance events, replacements, or glitches in communication with the central server. + + + +Similarly, while no expected timestamp is missing from the data altogether, a more rigorous test of timestamps _by turnstile_ reveals between roughly 50-100 missing observations for any given period. + + + +_Check out this [GitHub gist](https://gist.github.com/emilyriederer/4dcc6a05ea53c82db175e15f698a1fb6) to replicate these views locally._ + +## Right-sizing grouped checks + +If the power of grouped checks comes from our knowledge of the underlying systems, this same knowledge can guide our understanding of their limitations and when grouped checks aren't the right answer.  + +Just like we can't inspect every tie on our railroad track, grouped checks represent a tradeoff between effort (both cognitive and computational!) and value. They are most effective when groups are related to specific points of friction in our pipeline which we are unable to test or control what happens further upstream.  + +Not all groupings are equally likely to break the data. 
In the subway example, turnstile-level failures are likely because each individual turnstile is _independently_ involved in data collection and can break in its own unique ways. However, if we were working with clickstream data for our online ticket portal, the data collection process is centralized, so it would be unlikely for ETL to break in customer-specific ways and it would be cumbersome to execute checks by customer. + +Even when grouped checks have merit, their need might be a "code smell" that suggests we could instead be doing simpler checks further upstream. Since grouped checks are most often needed to counteract the blending of multiple data lineages, where possible they could be rewritten as more typical tests applied to each branch of the lineage before consolidation. For example, it would be nice if we could check for monotonicity before aggregating sensor data. However, when we lack control of those upstream processes, grouped checks offer a practical alternative. + +## Now in dbt-utils! + +If you're intrigued by the prospect of grouped checks, it's now possible to [run these tests from dbt-utils](https://github.com/dbt-labs/dbt-utils#grouping-in-tests). The 1.0.0 release [brings grouping in tests to all relevant tests](https://www.emilyriederer.com/post/grouping-data-quality-update/), specifically: + +- equal_rowcount() +- fewer_rows_than() +- recency() +- at_least_one() +- not_constant() +- sequential_values() +- non_null_proportion() + +Each check now has a `group_by_columns` argument which accepts one or more column names. 
For example, to check for a valid daily record for each turnstile in each station, we could add to our `schema.yml` file: + +```yaml +models: +  - name: turnstile_entries +    tests: + - dbt_utils.recency: +         datepart: day +          field: recorded_at +          interval: 1 + # Check for recency for each turnstile_id at each station_id +          group_by_columns: +            - station_id +            - turnstile_id +``` + +## Conclusion +And what should you do if your new data tests fail? This actually reveals the final benefit of hypothesis-driven checks: because you are testing for the failure of specific systems or processes, test results will direct your debugging attention towards the root cause of your data issue! Instead of embarking on a directionless quest, you'll immediately know where in your pipeline to focus your attention to get your system back on track. \ No newline at end of file diff --git a/website/blog/2023-01-24-aggregating-test-failures.md b/website/blog/2023-01-24-aggregating-test-failures.md new file mode 100644 index 00000000000..d82c202b376 --- /dev/null +++ b/website/blog/2023-01-24-aggregating-test-failures.md @@ -0,0 +1,134 @@ +--- +title: "How we cut our tests by 80% while increasing data quality: the power of aggregating test failures in dbt" +description: "A singular data quality test just failed...whew things are still ok — when *many* dbt tests fail, how do you make those failures actionable? Noah explores how aggregating test failures in dbt led to more informative, actionable, and self-service testing initiatives for end data users." +slug: aggregating-test-failures-with-dbt + +authors: [noah_kennedy] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2023-01-24 +is_featured: true +--- + +Testing the quality of data in your warehouse is an important aspect in any mature data pipeline. 
One of the biggest blockers for developing a successful data quality pipeline is aggregating test failures and successes in an informational and actionable way. However, ensuring actionability can be challenging. If ignored, test failures can clog up a pipeline and create unactionable noise, rendering your testing infrastructure ineffective. + + + +At [Tempus](https://www.tempus.com/), a precision medicine company specializing in oncology, high quality data is a necessary component for high quality clinical models. With roughly 1,000 dbt models, nearly a hundred data sources, and a dozen different data quality stakeholders, producing a framework that allows stakeholders to take action on test failures is challenging. Without an actionable framework, data quality tests can backfire — in early 2022, we had nearly a thousand tests, hundreds of which failed on a daily basis yet were wholly ignored. + +Recently, we overhauled our testing framework. We cut the number of tests down to 200, creating a more mature framework that includes metadata and emphasizes actionability. Our system for managing data quality is a three step process, described below: + +1. Leveraging the contextual knowledge of stakeholders, writing specific, high quality data tests, perpetuating test failure results into aliased models for easy access. +1. Aggregating test failure results using Jinja macros and pre-configured metadata to pull together high level summary tables. +1. Building views on top of the base table to split tests by owner or severity, and creating visualizations using our tool of choice. + +_It should be noted that this framework is for dbt v1.0+ on BigQuery. Small adaptations are likely necessary to make this framework run on a different setup._ + +## Specific, high quality data tests + +When we talk about high quality data tests, we aren’t just referencing high quality code, but rather the informational quality of our testing framework and their corresponding error messages. 
Originally, we theorized that any test that cannot be acted upon is a test that should not be implemented. Later, we realized there is a time and place for tests that should receive attention at a critical mass of failures. All we needed was a higher specificity system: tests should have an explicit severity ranking associated with them, equipped to filter out the noise of common, but low concern, failures. Each test should also mesh into established [RACI](https://project-management.com/understanding-responsibility-assignment-matrix-raci-matrix/) guidelines that state which groups tackle what failures, and what constitutes a critical mass. + +To ensure that tests are always acted upon, we implement tests differently depending on the user groups that must act when a test fails. This led us to have two main classes of tests — Data Integrity Tests (called [Generic Tests](https://docs.getdbt.com/docs/build/tests) in dbt docs) and Context Driven Tests (called [Singular Tests](https://docs.getdbt.com/docs/build/tests#singular-tests) in dbt docs), with varying levels of severity across both test classes. + +Data Integrity tests (Generic Tests)  are simple — they’re tests akin to a uniqueness check or not null constraint. These tests are usually actionable by the data platform team rather than subject matter experts. We define Data Integrity tests in our YAML files, similar to how they are [outlined by dbt’s documentation on generic tests](https://docs.getdbt.com/docs/build/tests). They look something like this — + +```yaml +version: 2 +models: + - name: patient + columns: + - name: id + description: Unique ID associated with the record + tests: + - unique: + alias: patient__id__unique + - not_null: + alias: patient__id__not_null +``` +
Example Data Integrity Tests in a YAML file — the alias argument is an important piece that will be touched on later.

+ +Context Driven Tests are more complex and look a lot more like models. Essentially, they’re data models that select bad data or records we don’t want, defined as SQL files that live in the `dbt/tests` directory. An example is shown below — + +```sql +{{ config( + tags=['check_birth_date_in_range', 'patient'], + alias='ad_hoc__check_birth-date_in_range' + ) +}} + +SELECT + id, + birth_date +FROM + {{ ref('patient') }} +WHERE birth_date < '1900-01-01' +``` +
The above test selects all patients with a birth date before 1900, due to data rules we have about maximum patient age.

+ +Importantly, we leverage [Test Aliasing](https://docs.getdbt.com/reference/resource-configs/alias) to ensure that our tests all follow a standard and predictable naming convention; our naming convention for Data Integrity tests is *table_name__column_name__test_name*, and our naming convention for Context Driven Tests is *ad_hoc__test_name*. Finally, to ensure all of our tests can then be aggregated, we modify the `dbt_project.yml` file and [set the `store_failures` tag to ‘TRUE’](https://docs.getdbt.com/reference/resource-configs/store_failures), thus persisting test failures into SQL tables. + +At this point in development, we have Data Integrity Tests defined in the YAML and Context Driven Tests defined as SQL files. Tests are specific, actionable, and realistic, and each comes with an idea of severity, and a group of users who care when it fails. All of our tests are aliased according to a specific naming convention so that we know the table names they will put data into, and we have modified our dbt project config to set `store_failures` to true for all tests. + +## Test aggregation using metadata + +Our next step is to define test metadata for each of our tests. The reason for this is twofold. First, we want to ensure that in later visualization steps, we can attach a description and a more human-readable name to the test. Second, having a metadata file allows us to attach all sorts of extra information to tests: who owns the test, how severe it is, and if the test is active or inactive, just to name a few. + +Our metadata is stored in a [seed file](https://docs.getdbt.com/docs/build/seeds). The only required field here is the `test_alias`, which acts as a key to link the metadata to the name of the test failures table. We also include the test severity, the test owner, a test description, and several other fields that act as filters for future aggregation tables. + +After defining our metadata seed file, we begin the process of aggregating our data. 
We aggregate our data by defining a base model that joins our test failure results (now stored in a separate schema) with the metadata we defined. Below is an example of what that code looks like — + +```sql +{{ config( + materialized = 'incremental', + partition_by = {'field': 'load_date', 'data_type': 'date'}, + incremental_strategy = 'merge', + unique_key='row_key', + full_refresh=false, + tags=['dq_test_warning_failures','clinical_mart', 'data_health'] + ) +}} + +WITH failures as ( + SELECT + count(*) as test_failures, + _TABLE_SUFFIX as table_suffix, + FROM {{ var('clinical_mart_schema') }}_dbt_test__audit.`*` + GROUP BY _TABLE_SUFFIX +), + +metadata as ( + SELECT + test_owner, + test_alias, + test_description, + split(test_alias, '__')[SAFE_ORDINAL(2)] as test_name, + test_severity + FROM {{ref('test_warning_metadata')}} +), + +SELECT + m.*, + f.* +FROM metadata m +LEFT JOIN failures f on m.test_alias = f.table_suffix +WHERE m.is_active is TRUE +``` +
Example Metadata + Test Failure Aggregation Base Model.

+ +Some key components: + +- We materialize our base model as incremental, set `full_refresh` to *false* within the `dbt_project.yml`, and partition our table by date to ensure that we keep historical data. +- We use BigQuery, which allows [wild card selectors](https://cloud.google.com/bigquery/docs/querying-wildcard-tables) and makes our life much easier. If you’re using a different framework, you most likely need to write a loop using Jinja. +- Since we have an expected naming convention, we can split the `test_alias` to get components like table name or column name if we desire. + +Now that our base model is developed, we have a central point of truth that aggregates all of our data tests into one location, complete with metadata that gives more insight into the test, as well as who owns it. Our final step is leveraging our base table to gain added insights from our tests. + +## Finishing touches and conclusions + +With our finalized data quality base table, there are many other options for cleaning up our framework or creating visualizations. Our team uses the base table in a few main ways. + +First, we create views on top of the base table that filter down by test owner. We strongly believe that test noise is the biggest risk towards the success of a quality framework. Creating specific views is like giving each team a magnifying glass that lets them zoom into only the tests they care about. We also have a dashboard, currently in Google Looker Studio, that shows historical test failures with a suite of filters to let users magnify high severity tests and constructs machine-composed example queries for users to select failing records. When a test fails, a business analyst can copy and paste a query from the dashboard and get all the relevant information. + +As with any framework, it’s always a work in progress — we still encounter issues with noise in our tests, and still struggle to wrangle our users to care when a test fails. 
However, we’ve found that this data framework works exceptionally well at enabling data users to create and deploy their own tests. All they need to do is submit a pull request with SQL code that flags bad data, and write one line of metadata. \ No newline at end of file diff --git a/website/blog/2023-02-01-ingestion-time-partitioning-bigquery.md b/website/blog/2023-02-01-ingestion-time-partitioning-bigquery.md new file mode 100644 index 00000000000..99ce142d5ed --- /dev/null +++ b/website/blog/2023-02-01-ingestion-time-partitioning-bigquery.md @@ -0,0 +1,197 @@ +--- +title: "BigQuery ingestion-time partitioning and partition copy with dbt" +description: "How one data team saved significant BigQuery run time (and therefore $$) by building ingestion-time partitioning support into dbt's incremental model materialization." +slug: bigquery-ingestion-time-partitioning-and-partition-copy-with-dbt +canonical_url: https://medium.com/teads-engineering/bigquery-ingestion-time-partitioning-and-partition-copy-with-dbt-cc8a00f373e3 + +authors: [christophe_oudar] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2023-03-10 +is_featured: true +--- + +At Teads, we’ve been using BigQuery (BQ) to build our analytics stack since 2017. As presented in a previous [article](https://medium.com/teads-engineering/give-meaning-to-100-billion-analytics-events-a-day-d6ba09aa8f44), we have designed pipelines that use multiple roll-ups that are aggregated in data marts. Most of them revolve around time series, and therefore time-based partitioning is often the most appropriate approach. + + + +Back then, only ingestion-time partitioning was available on BQ and only at a daily level. Other levels required working with sharded tables. It’s still the case if we consider the partition limit set at 4096 when we’re using hourly partitions, since it translates to roughly 170 days. 
+ +We built an internal SQL query executor tool to wrap the execution of our BigQuery jobs while dbt Labs (formerly known as Fishtown Analytics) was creating its own product: dbt. After a successful experiment in 2021, dbt is now part of our go-to solution to create new BigQuery jobs at Teads. Though it misses a few custom features, it has become a superset of our former tool for everyday usage. + +As column partitioning was released on BQ, and dbt favored [incremental materialization](/docs/build/incremental-models), we identified one case that wasn’t well supported: [ingestion-time partitioned tables using incremental materialization](https://github.com/dbt-labs/dbt-bigquery/issues/75). + +🎉 **We’ve been drafting a technical solution since the end of 2021 and finally managed to [merge our contribution](https://github.com/dbt-labs/dbt-bigquery/pull/136) during Coalesce 2022!** + +## When to use ingestion-time partitioning tables + +Ingestion-time partitioning tables are very similar to column-type partitioning with `TIMESTAMP` columns. We can actually replicate most of the behavior from each other. + +Let’s see the main differences brought by ingestion-time partitioning tables: + +- In ingestion-time partitioning tables, we have a `TIMESTAMP` pseudo column called `_PARTITIONTIME`. This is not taken into account in the table’s weight which is interesting, so if you have a lot of rows, it can be worth it. You can also request `_PARTITIONDATE` which contains the same data truncated at the day-level with a `DATE` type. +- Selecting data from ingestion-time partitioning tables that include a pseudocolumn is also cheaper because the column is not billed. We also figured out that queries filtering on time partition columns are faster on ingestion-time partitioning tables regarding slot time. So whether we’re using “pay as we go” or “flat rate”, we’re better off with ingestion-time partitioning tables regarding performance. 
+- If we need to insert into multiple time partitions in a load/insert, we have to use column-type partitioning. Yet you can use a merge to insert in multiple partitions with ingestion-time partitioning tables. +- We can’t select the pseudocolumn as is for some operations such as a `GROUP BY` and it must be renamed. Practically the column name is restricted and we have to alias it to something else. +- We cannot use a `CREATE TABLE … AS SELECT …` on ingestion-time partitioning tables; it’s one of the main reasons why dbt didn’t support them at first with incremental materialization. It requires creating the table using a `PARTITION BY` clause and then inserting the data. + +As a rule of thumb, you can consider that if your table partition length is less than 1 million rows, you’re better off using column-type partitioning. + +## How to use ingestion-time partitioning in dbt + +> The following requires dbt bigquery v1.4+ + +When we designed ingestion partitioning table support with the dbt Labs team, we focused on ease of use and how to have seamless integration with incremental materialization. + +One of the great features of incremental materialization is to be able to proceed with a full refresh. We added support for that feature and, luckily, `MERGE` statements are working as intended for ingestion-time partitioning tables. This is also the approach used by the [dbt BigQuery connector](/docs/core/connect-data-platform/bigquery-setup). + +The complexity is hidden in the connector and it’s very intuitive to use. 
For example, if you have a model with the following SQL: + +```sql +{{ config( + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + partition_by = { + "field": "day", + "data_type": "date" + } +) }} + +select + day, + campaign_id, + NULLIF(COUNTIF(action = 'impression'), 0) impressions_count +from {{ source('logs', 'tracking_events') }} +``` + +We only need to add a field to move to ingestion-time partitioning: `"time_ingestion_partitioning": true` + +```sql +{{ config( + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + partition_by = { + "field": "day", + "data_type": "date", + "time_ingestion_partitioning": true + } +) }} + +select + day, + campaign_id, + NULLIF(COUNTIF(action = 'impression'), 0) impressions_count +from {{ source('logs', 'tracking_events') }} +``` + +The resulting table schema will be: + +```yaml +campaign_id INT64 +impressions_count INT64 +``` + +Indeed the day column data will be inserted into the `_PARTITIONTIME` pseudo column which is not visible in the table schema. Underneath, dbt generates a `MERGE` statement that wraps the insertion in the table. It’s very convenient when our model output contains multiple partitions and/or your incremental strategy is `insert_overwrite`. + +### MERGE statements and performance + +However, if you need to insert or overwrite a single partition, for instance, with an hourly/daily rollup, then writing on an explicit partition is much more efficient than a `MERGE`. + +We had a job with millions of rows on which we compared both approaches and measured: + +- 43 minutes with a `MERGE` approach using dbt +- 26 minutes with a custom query using `WRITE_TRUNCATE` on the destination table using a partition decorator + +That’s a 17-minute difference which means that almost 40% of the `MERGE` statement’s run time is spent on adding the data to the table. + +Of course, the `MERGE` statement offers much more flexibility than a `WRITE_TRUNCATE` query. 
Yet in most analytics workload cases, the queries are time series that are immutable - and therefore, either the destination partition is empty or we’ll likely have to reprocess a partial period so that it translates into overwriting every row in a subset of the existing partitions. + +### Efficient solution + +The dbt approach to insert/overwrite incremental partitions using `insert_overwrite` without using static partitions is the following: + +- Create a temporary table using the model query +- Apply the schema change based on the `on_schema_change` configuration +- Use a `MERGE` statement to insert the data from temporary table into the destination one + +If we want to get rid of the `MERGE` statement, there are 2 solutions: + +- Use a `SELECT` statement over the data of a partition from the temporary table and use the partition decorator on the destination table to output the data using `WRITE_TRUNCATE` +- Copy every partition with overwrite from using BigQuery driver + +In both cases, the operation can be done on a single partition at a time so it requires a tradeoff between speed and model atomicity if multiple partitions are involved. + +On a 192 GB partition here is how the different methods compare: + + + +Also, the `SELECT` statement consumed more than 10 hours of slot time while `MERGE` statement took days of slot time. + +So picking the BQ copy approach is definitely a no-brainer. That’s the solution we picked to improve the BQ output on incremental materialization using the `insert_overwrite` strategy. + +Though it looks like a silver bullet, there are cases where we DON’T want to use it: + +- If we have a small partition, merging on a small table, the gains are negligible +- If a lot of partitions are involved, the copy will happen sequentially. It could be parallelized in another update but depending on how many concurrent operations would be configured, the performance might still not improve enough over a `MERGE` statement. 
+- If you need consistency across multiple partitions replacement, this approach will not fit your needs as all partitions are not replaced atomically. + +## How to use partition copy with dbt + +> The following requires dbt bigquery v1.4+ + +To move a model to use partition copy instead of a `MERGE` statement, let’s take the same model as previously: + +```sql +{{ config( + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + partition_by = { + "field": "day", + "data_type": "date" + } +) }} + +select + day, + campaign_id, + NULLIF(COUNTIF(action = 'impression'), 0) impressions_count +from {{ source('logs', 'tracking_events') }} +``` + +Again we only need to add a field to move to partition copy: `"copy_partitions": true` + +```sql +{{ config( + materialized = 'incremental', + incremental_strategy = 'insert_overwrite', + partition_by = { + "field": "day", + "data_type": "date", + "copy_partitions": true + } +) }} + +select + day, + campaign_id, + NULLIF(COUNTIF(action = 'impression'), 0) impressions_count +from {{ source('logs', 'tracking_events') }} +``` + +The configuration will be read at run time and will use the BQ driver integration to write the data using partition copy. The integration should be seamless. + +## Conclusion + +**Combining ingestion-time partitioning and partition copy is a great way to achieve better performance for your models**. Of course, it would have been simpler if both features were fully integrated with SQL and didn’t require work around BigQuery Data Definition Language SQL or driver usage. + +But thanks to dbt’s open-source approach and dbt Labs team, **we had the opportunity to add support for those use cases** and bring it to more BigQuery users. + +Lastly, I wanted to share [Jeremy Cohen’s post](https://discourse.getdbt.com/t/bigquery-dbt-incremental-changes/982) which is giving great insights to **figure out how to pick an incremental strategy** and its options depending on your needs. 
+ +--- + +**If you love working with data at scale and look for a new challenge**, have a look at our [engineering job opportunities](https://engineering.teads.com/jobs/) at Teads. + +🎁 If this article was of interest, you might want to have a look at [BQ Booster](https://bqbooster.kayrnt.fr/), a platform I’m building to help BigQuery users improve their day-to-day. diff --git a/website/blog/2023-02-14-passing-the-dbt-certification-exam.md b/website/blog/2023-02-14-passing-the-dbt-certification-exam.md new file mode 100644 index 00000000000..dbd0b856fe9 --- /dev/null +++ b/website/blog/2023-02-14-passing-the-dbt-certification-exam.md @@ -0,0 +1,78 @@ +--- +title: Tips and advice to study for, and pass, the dbt Certification exam +description: So you want to make it *official*? Callie and Jade, analytics engineering at Montreal Analytics, share their advice on passing the dbt Certification Exam. +slug: tips-for-the-dbt-certification-exam +authors: [callie_white, jade_milaney] +tags: [analytics craft] +hide_table_of_contents: false +date: 2023-02-16 +is_featured: true +--- +The [new dbt Certification Program](https://www.getdbt.com/blog/dbt-certification-program) has been created by dbt Labs to codify the data development best practices that enable safe, confident, and impactful use of dbt. Taking the Certification allows dbt users to get recognized for the skills they’ve honed, and stand out to organizations seeking dbt expertise. + +Over the last few months, [Montreal Analytics](https://www.montrealanalytics.com/), a full-stack data consultancy servicing organizations across North America, has had over 25 dbt Analytics Engineers become certified, earning them the 2022 dbt Platinum Certification award. + +In this article, two Montreal Analytics consultants, Jade and Callie, discuss their experience in taking, and passing, the dbt Certification exam to help guide others looking to study for, and pass the exam. + + +## What brought you to the exam? 
+ +**Jade (J):** I’m a newly minted Montreal Analytics Analytics Engineer coming from a background as a data analyst with a few months of dbt experience. My last company used some basic dbt in their BI implementation, not so far as tests or snapshots, but I have some exposure(s) (pun intended). I decided to take the exam to build up my knowledge as a Junior Analytics Engineer and to give future clients confidence in my skills. + +**Callie (C):** As an experienced Analytics Engineer with 4 years of dbt experience, dating back to late 2018, 5 years of experience across the Modern Data Stack, and an overall 6 years of a career spent in data, I approached the new dbt test with an ‘old school’ dbt repertoire and years of ingrained bad habits. I have been through the whole evolution of dbt’s growth, and so I wanted to take the exam to polish up my skills, showcase Montreal Analytics’ dbt expertise, and earn that shiny badge for my LinkedIn profile. + +## How did you prepare for the exam? + +**J:** To prepare for the exam, I built up a practice dbt project. All consultants do this as part of Montreal Analytics onboarding process, and this project allowed me to practice implementing sources and tests, refactoring SQL models, and debugging plenty of error messages. Additionally, I reviewed the [Certification Study Guide](https://www.getdbt.com/assets/uploads/dbt_certificate_study_guide.pdf) and attended group learning sessions. + +**C:** To prepare for the exam I reviewed the official dbt Certification Study Guide and the [official dbt docs](https://docs.getdbt.com/), and attended group study and learning sessions that were hosted by Montreal Analytics for all employees interested in taking the exam. 
As a group, we prioritized subjects that we felt less familiar with; for the first cohort of test takers this was mainly newer topics that haven’t yet become integral to a typical dbt project, such as [doc blocks](https://docs.getdbt.com/docs/collaborate/documentation#using-docs-blocks) and [configurations versus properties](https://docs.getdbt.com/reference/configs-and-properties). These sessions mainly covered the highlights and common “gotchas” that are experienced using these techniques. The sessions were moderated by a team member who had already successfully completed the dbt Certification, but operated in a very collaborative environment, so everyone could provide additional information, ask questions to the group, and provide feedback to other members of our certification taking group. + +I felt comfortable with the breadth of my dbt knowledge and had familiarity with most topics. However in my day-to-day implementation, I am often reliant on documentation or copying and pasting specific configurations in order to get the correct settings. Therefore, my focus was on memorizing important criteria for *how to use* certain features, particularly on the order/nesting of how the key YAML files are configured (dbt_project.yml, table.yml, source.yml). + +## How did the test go? + +**J:** With a cup of coffee I started my exam in high spirits and high stress. I had never taken a proctored exam before, so going into this I had to adjust to being on camera while taking a test and in general taking a test in my living room felt strange! + +The first few questions were trickier than I’d anticipated, and my heart started beating a little faster as a result. I found the build-list questions, five lines of code to create a valid YAML or SQL file that accomplishes a certain task, particularly difficult. + +The exam consists of 65 questions, usually containing multiple parts, so by 90 minutes in I started to get tired. 
I’d flagged several questions and went back to check on those before submitting. At the time, I thought I answered about 60% of these questions right? Having lost my coffee buzz and with shaky confidence I submitted the test to see my result. Failed. + +**C:** In advance of test day I did spend extra time making sure that I had an appropriate environment for taking the test; I booked a study room at my local library that had absolutely nothing on the walls, was completely quiet, and had a solid internet connection so that I wouldn’t have to be stressed about any details on that front. + +I had been informed of the discrete option multiple choice questions, but those threw me off and shook my confidence. The total “major” question (ie. Question 5) was composed of a certain number of smaller sub-questions (ie. Question 5a-e). If you answered the first sub-question correctly, it served you the second sub-question. If you answered the second sub-question correctly, it moved to the third sub-question. If you got any of the sub-questions incorrect the page navigated to the next major question and you lost the point for the whole major question. There was no indication for how many sub-questions were in each major question, therefore regardless of how many questions I answered correctly, I never knew if it moved to the next major question because I was wrong, or because I had answered all the questions. + +Otherwise, the questions required considerable focus to flush out the differences between them as there were often only small differences between the answer options. However, once I identified what the differences were between answer options I felt that it was pretty easy for me to choose the most correct option. + +The questions that personally tripped me were things that I had expected to trip me up. 
These had the common theme of being technical terminology for specific techniques that I know how to use, but couldn’t speak the same official language that the test was using. An example is distinguishing slowly-changing data type(s) and their implementation in dbt. + +Additionally, I got tripped up on the official names for test types — what was previously known as “schema/data tests” are now referred to as “generic” or “singular tests”. dbt Labs changed their naming conventions during the dbt 1.0.0 upgrade, so I was more familiar with the [old naming conventions](https://docs.getdbt.com/guides/legacy/writing-custom-generic-tests). + +## Thoughts after taking the test? + +**J:** Personal experience has taught me that the first thing to do when you’ve failed a test is to get yourself a treat. I took myself to my favorite bakery; got a walk in and a Boston Cream donut. Now I could properly reflect on how the test went, and prepare for round two. + +**C:** After the test I immediately turned Slack back on, answered client questions, and got back to work. Check, done, and moving on. + +The test felt as I had expected; I knew I hadn’t aced it because I had prioritized my actual dbt work over memorizing for a test I only needed 65% to pass. I was confident that my practical experience, along with brushing up on some specifics would get me a pass. + +A huge proportion of a role in data is weighing the cost versus benefit of a task. Does the value of the insight gained from the data justify the effort and cost of modeling that data? What’s the acceptable error threshold? What’s the priority in comparison to other tasks that could be done? I applied the same principle to completing my dbt Certification. + +## Retaking the dbt Certification exam + +**J:** Getting ready for my second attempt, I leaned heavily on the dbt documentation and reference pages; I focused on debugging errors and schema generation, but the goal here was mostly to build up my confidence. 
A colleague and I built flashcards (decks [1](https://quizlet.com/ca/718959401/dbt-study-terms-and-practice-qs-flash-cards/) and [2](https://quizlet.com/ca/720366359/dbt-certification-prep-2-flash-cards/)) to help with this, working on recall of commands, error types, configurations, and properties. The extra hours and flash cards combined with pep talks from my team members had me feeling ready to go. + +Test taking is a skill. As Callie says, "Academic-type test taking is its own skill that has huge emotional factors and has not been made for the way everyone’s brain works best." + +With a deep breath, and a big cup of water — it’s 2:30pm two weeks later and I’m retaking the exam. Same set up as before with some tricky questions, especially the build list ones. I’m thankful I studied debugging errors and schemas because that came up a few times. I answer the questions confidently and flag 12 to look at again before I submit the exam at 3:45pm. +Passed. + +Now, the first thing you must do when you’ve passed a test is to get yourself a treat — it’s Boston Cream time! + +## Now that you’re dbt official, where to next? + +**C:** I’m continuing on the same path, but with the shiny badge prominently posted on my LinkedIn profile along with the certifications of my teammates. The validation of having an external party verify that I know the key best practices in the primary data tool I use and that I can be trusted to implement a successful dbt project is great. + +Standards and best practices are very important, but a test is a measure at a single point in time of a rapidly evolving industry. It’s also a measure of my test-taking abilities, my stress levels, and other things unrelated to my skill in data modeling; I wouldn’t be a good analyst if I didn’t recognize the faults of a measurement. 
I’m glad to have this check mark completed, but I will continue to stay up to date with changes, learn new data skills and techniques, and find ways to continue being a holistically helpful teammate to my colleagues and clients. + + +You can learn more about the dbt Certification [here](https://www.getdbt.com/blog/dbt-certification-program/). \ No newline at end of file diff --git a/website/blog/2023-03-23-audit-helper.md b/website/blog/2023-03-23-audit-helper.md new file mode 100644 index 00000000000..8599ad5eb5d --- /dev/null +++ b/website/blog/2023-03-23-audit-helper.md @@ -0,0 +1,261 @@ +--- +title: "Audit_helper in dbt: Bringing data auditing to a higher level" +description: Migrations from stored procedures to dbt become easier with the audit_helper package. The team at Indicium Tech breaks down the how and why of using this package during your dbt migration. +authors: [arthur_marcon, lucas_bergodias, christian_vanbellen] +slug: audit-helper-for-migration +tags: [analytics craft] +hide_table_of_contents: false +date: 2023-03-24 +is_featured: true +--- + +Auditing tables is a major part of analytics engineers’ daily tasks, especially when refactoring tables that were built using SQL Stored Procedures or Alteryx Workflows. In this article, we present how [the `audit_helper` package](https://github.com/dbt-labs/dbt-audit-helper) can (as the name suggests) help the table auditing process to make sure a refactored model provides (pretty much) the same output as the original one, based on our experience using this package to support our clients at Indicium Tech®. + + + +## Introduction + +It is common for analytics engineers (AE) and data analysts to have to refactor (or translate) SQLServer® Stored Procedures, Alteryx Workflows® or other modeling tools into dbt models, or even refactor a dbt model to update its data sources. Also, many times, some business rules are applied in different models (and tools), and we as AEs may need to make sure that outputs match. 
However, ensuring that the values in the original table and in the refactored one match used to be a hard task that involved a lot of manual coding and some generalistic tests (such as counting the amount of rows or summing all values in a column). + +Not only is that approach time-consuming, but it is also prone to naive assumptions that values match based on aggregate measures (such as counts or sums). To provide a better, more accurate approach to auditing, dbt Labs has created the `audit_helper` package. `audit_helper` is a package for dbt whose main purpose is to audit data by comparing two tables (the original one versus a refactored model). It uses a simple and intuitive query structure that enables quickly comparing tables based on the column values, row amount, and even column types (for example, to make sure that a given column is numeric in both your table and the original one). Figure 1 graphically displays the workflow and where `audit_helper` is positioned in the refactoring process. + + + +Now that it is clear where the `audit_helper` package is positioned in the refactoring process, it is important to highlight the benefits of using audit_helper (and ultimately, of auditing refactored models). Among the benefits, we can mention: +- **Quality assurance**: Assert that a refactored model is reaching the same output as the original model that is being refactored. +- **Easy and intuitive code**: Because audit_helper relies on dbt macros, it was designed to be an intuitive tool that runs on simple SQL queries. +- **Clear output**: Audit_helper provides clear output showing how much of your refactored table matches the original one. +- **Visibility to a project**: Audit_helper gives visibility to a refactoring process by showing how your code is delivering the same results in both row-wise and column-wise comparisons. 
+- **Flexibility to compare rows and columns**: It is simple to quickly compare the results in rows or columns through pre-made templates that just require you to place your columns’ names and the original model’s ones. + +Next, we provide instructions on how to install audit_helper in your dbt project, how to run row comparison (with the `compare_queries` macro), column comparison (with the `compare_column_values` macro) and provide some tips for `audit_helper` usage. + +## Installation instructions +Let’s start by setting the stage for the audit_helper package to work in our project. It's important to note that the `audit_helper` package must run on dbt versions above or equal to 1.2.0 and below 2.0.0. Next, the two steps below will guide you on how to install and get `audit_helper` up and running. + +1. First, add a `packages.yml` file to your dbt project, if you don't have one already. Inside the `packages.yml` file, add the `audit_helper` package by copying and pasting the code below. Note that this file should be at the same level as your `dbt_project.yml` file. + + ```yaml + packages: + - package: dbt-labs/audit_helper + version: 0.7.0 + ``` + +2. Run `dbt deps` in the command line to install the package(s). Packages get installed in the `dbt_packages` directory — by default this directory is ignored by git, to avoid duplicating the source code for the package. For more information on using packages in your dbt project, check out [the dbt Documentation](https://docs.getdbt.com/docs/build/packages). + +Now that `audit_helper` is installed, let’s talk about its two main macros: +- `compare_queries` — to audit rows +- `compare_column_values` — to audit values in columns. 
+ +## Audit rows (`compare_queries`) + +According to the `audit_helper` package documentation, this macro comes in handy when: +- You need to filter out records from one of the relations, +- Some columns must be renamed or recast in order to match, +- But you only want to compare a few columns, since it’s simpler to write the columns you want to compare rather than the columns you want to exclude. + +### How it works +When you run the dbt audit model, it will compare all columns, row by row. To count for the match, every column in a row from one source must exactly match a row from another source, as illustrated in the example in Figure 2 below: + + + + +As shown in the example, the model is compared line by line, and in this case, all lines in both models are equivalent and the result should be 100%. Figure 3 below depicts a row in which two of the three columns are equal and only the last column of row 1 has divergent values. In this case, despite the fact that most of row 1 is identical, that row will not be counted towards the final result. In this example, only row 2 and row 3 are valid, yielding a 66.6% match in the total of analyzed rows. + + + +As previously stated, for the match to be valid, all column values of a model’s row must be equal to the other model. This is why we sometimes need to exclude columns from the comparison (such as date columns, which can have a time zone difference from the original model to the refactored — we will discuss tips like these below). + +### The code for the `compare_queries` macro: Step-by-step +1. Create a new `.sql` model in the folder of your choice +2. 
Copy and paste the following example below in the model created: + +```sql +{# in dbt Develop #} + + +{% set old_fct_orders_query %} +select + id as order_id, + amount, + customer_id +from old_etl_schema.fct_orders +{% endset %} + + +{% set new_fct_orders_query %} +select + order_id, + amount, + customer_id +from {{ ref('fct_orders') }} +{% endset %} + + +{{ audit_helper.compare_queries( + a_query=old_fct_orders_query, + b_query=new_fct_orders_query, + primary_key="order_id" +) }} +``` +Let’s understand the arguments used in the `compare_queries` macro: +- `primary_key` (optional): The macro accepts a primary key argument to join the results of the queries together, but if the compared queries lack one, you can create one or simply remove that line of code. Because this type of comparison evaluates all values in a row, it will not show any difference if a primary key is not specified. +- `summarize` (optional): This argument allows you to switch between a summary or detailed (verbose) view of the compared data. This argument accepts true or false values (its default is set to be true). + +3. Replace the sources from the example with your own + + + As illustrated in Figure 4, using the `ref` statements allows you to easily refer to your development model, and using the full path makes it easy to refer to the original table (which will be useful when you are refactoring a SQL Server Stored Procedure or Alteryx Workflow that is already being materialized in the data warehouse). + +4. Specify your comparison columns + + + Delete the example columns and replace them with the columns of your models, exactly as they are written in each model. You should rename/alias the columns to match, as well as ensuring they are in the same order within the `select` clauses. + + When there are a lot of columns in the data set, start with a few of them (say 5 columns) and run; when you get a good match, add more columns and run again. 
You can also easily comment out the columns that you don’t want to compare! + + Commonly, when comparing many columns at once and getting a bad result, it is difficult to know which column contains the differing values. However, as seen above, it is enough that the values are different in one of the columns to invalidate the line as a whole. As a result, we strongly advise you to start with a few columns and gradually add more columns as you get a good match. When a new column causes a drop in the match, examine the column in both models. + +5. Run the audit model as you would run any other dbt model using the command below: + + `dbt run --select your_audit_model` + +6. Check the result by copying and pasting the code below into your development environment: + + ```sql + select * from your_schema.your_audit_model + + -- or select * from {{ ref('your_audit_model') }} if you're in the dbt Cloud IDE + ``` + The output will be similar to the one shown in Figure 6 below: + + +
+ The output is presented in table format, with each column explained below: +
+- **IN_A**: Data present in model A
+- **IN_B**: Data present in model B
+- **COUNT**: Number of rows in that `IN_A`/`IN_B` combination
+- **PERCENT_OF_TOTAL**: Percentage of all compared rows that fall into that `IN_A`/`IN_B` combination
+ + In the first row we can see `TRUE` in the column `IN_A` and `TRUE` in the column `IN_B`, which means that in both models, there are 1,966,215 rows with all columns matching values, accounting for 97.65% of the total. The second row shows 20,543 lines from model A that do not directly match with any row in model B, and the third row states conclusively that there are 26,720 lines from model B that do not directly match with any row in model A. + +:::tip Extra + +To know the difference between them (in the example above, the cumulative 2.35% difference), join both of the sources using the primary key and put the same columns side by side, and use a `where` clause to help you find where one column is different from model A when compared with model B. This inspection can be a first step to determine the possible cause of error. We recommend analyzing one column at a time. + +::: + +## Audit columns (`compare_column_values`) +We have talked about the `compare_queries` macro to audit rows, and how it can give the data analyst a nice overview of general compatibility ratio, with some flexibility to select specific column groups and apply business rules directly over the final audit query. + +But, despite that being a powerful tool, it does not solve all of the problems with data auditing between legacy and refactored models. + +While we can surely rely on that overview to validate the final refactored model with its legacy counterpart, it can be less useful while we are in the middle of the process of rebuilding a data workflow, where we need to track down which are exactly the columns that are causing incompatibility issues and what is wrong with them. + +A really useful way to check out which specific columns are driving down the match percentage between tables is the `compare_column_values` macro that allows us to audit column values. 
This macro requires a column to be set, so it can be used as an anchor to compare entries between the refactored dbt model column and the legacy table column. Figure 7 illustrates how the `compare_column_values` macro works. + + + + +The macro’s output summarizes the status of column compatibility, breaking it down into different categories: perfect match, both are null, values do not match, value is null in A only, value is null in B only, missing from A and missing from B. This level of detail makes it simpler for the AE or data analyst to figure out what can be causing incompatibility issues between the models. While refactoring a model, it is common that some keys used to join models are inconsistent, bringing up unwanted null values on the final model as a result, and that would cause the audit row query to fail, without giving much more detail. + +With the `compare_column_values` macro, the report output addresses specifically that problem, pointing out to the analyst what specific data inconsistencies exist. + +### The code for the `compare_column_values` macro: Step-by-step +1. Create a new `.sql` model in the folder of your choice in your dbt project +2. 
Copy and paste the following example in the model created: + +```sql +{# in dbt Develop #} + +{% set old_etl_relation_query %} +select * from public.dim_product +where is_latest +{% endset %} + + +{% set new_etl_relation_query %} +select * from {{ ref('dim_product') }} +{% endset %} + + +{% set audit_query = audit_helper.compare_column_values( + a_query=old_etl_relation_query, + b_query=new_etl_relation_query, + primary_key="product_id", + column_to_compare="status" +) %} + + +{% set audit_results = run_query(audit_query) %} + + +{% if execute %} + {% do audit_results.print_table() %} +{% endif %} +``` + +The arguments used by this macro are pretty much the same as those used by the `compare_queries` macro, with the addition of the `column_to_compare` argument, which, as the name suggests, declares which column is specifically going to be tested. + +The `compare_column_values` macro requires the `primary_key` argument, which is going to be a fundamental part of correctly generating the metrics attributed to the compared column, acting as an anchor to compare every row from the refactored model with its legacy counterpart. + +Also, we can see that the example code includes a table printing option enabled by default. This prints the result of the query in the terminal when the macro is run during the dbt compilation step. That can be useful to quickly check out the compatibility status of a column, without leaving the code editor while refactoring SQL through dbt. + +3. Replace column names and source references from the example with the respective information of the compared models. +4. Optional: Disable the `print_table()` command, so the model can be materialized on your data warehouse. 
+ To disable table printing on terminal and to enable model materialization in your target data warehouse, the following can be done: replace this entire section of code, which declares the SQL model as a variable and makes it printable, with the macro execution pattern. + + ```sql + - Replace the commented code below: + - {% set audit_results = run_query(audit_query) %} + + - {% if execute %} + - {% do audit_results.print_table() %} + - {% endif %} + + - With the following piece of code: + {{ audit_query }} + ``` + +5. To get the results, you can simply run the model as you would with a regular dbt model using the following command: + + `dbt run --select <name_of_your_model>` + + But unlike with the `compare_queries` macro, if you have kept the printing function enabled, you should expect a table to be printed in the command line when you run the model, as shown in Figure 8. Otherwise, it will be materialized on your data warehouse like this: + + + + The `compare_column_values` macro separates column auditing results into seven different labels: + - **Perfect match**: count of rows (and relative percentage) where the column values compared between both tables are equal and not null; + - **Both are null**: count of rows (and relative percentage) where column values compared between both tables are null; + - **Missing from A**: count of rows (and relative percentage) with column values that exist in table B, but not in table A; + - **Missing from B**: count of rows (and relative percentage) with column values that exist in table A, but not in table B; + - **Value is null in A only**: count of rows (and relative percentage) with column values that are not null in table B but are null in table A; + - **Value is null in B only**: count of rows (and relative percentage) with column values that are not null in table A but are null in table B; + - **Values do not match**: count of rows (and relative percentage) where the column values compared between both tables are different and not null. 
+ + With this detailed report, it becomes easier for the AE to find out what could be going wrong with the data refactoring workflow, so the issue can be directly investigated and solved. Also, with some extra coding and orchestration, the column reports generated could be aggregated and put into a production environment, bringing data validation observability to an even higher level. + +## References + +Below, we listed the main references we consulted while writing this article, and we recommend reading them for further information on `audit_helper`. +- `Audit_helper`’s [GitHub repository](https://hub.getdbt.com/dbt-labs/audit_helper/latest/) +- dbt Labs post on data auditing ["How to not lose your mind when auditing data part 1"](https://discourse.getdbt.com/t/how-to-not-lose-your-mind-when-auditing-data/445) +- dbt Labs post on data auditing ["How to not lose your mind when auditing data part 2"](https://discourse.getdbt.com/t/how-to-not-lose-your-mind-when-auditing-data-part-ii/612) +- dbt Labs post on [how to migrate from SQL Stored Procedures to dbt models](https://docs.getdbt.com/blog/migrating-from-stored-procs) + + + + + + + + + + + + + + + diff --git a/website/blog/2023-03-30-guide-to-debug-in-jinja.md b/website/blog/2023-03-30-guide-to-debug-in-jinja.md new file mode 100644 index 00000000000..27935ad484f --- /dev/null +++ b/website/blog/2023-03-30-guide-to-debug-in-jinja.md @@ -0,0 +1,157 @@ +--- +title: "The missing guide to debug() in dbt" +description: Jinja brings a lot of automation and joy to dbt—it also brings additional complexity and required skills to succeed with it. In this Developer Blog, Benoit walks through the useful debug() command in Jinja to make debugging macros more efficient and intuitive. 
+authors: [benoit_perigaud] +slug: guide-to-jinja-debug +tags: [analytics craft] +hide_table_of_contents: false +date: 2023-03-29 +is_featured: true +--- + +*Editor's note—this post assumes intermediate knowledge of Jinja and macros development in dbt. For an introduction to Jinja in dbt check out [the documentation](https://docs.getdbt.com/docs/build/jinja-macros) and the free self-serve course on [Jinja, Macros, Packages](https://courses.getdbt.com/courses/jinja-macros-packages).* + +Jinja brings a lot of power to dbt, allowing us to use `ref()`, `source()`, conditional code, and [macros](https://docs.getdbt.com/docs/build/jinja-macros). But, while Jinja brings flexibility, it also brings complexity, and like many times with code, things can run in unexpected ways. + +The [`debug()`](https://docs.getdbt.com/reference/dbt-jinja-functions/debug-method) macro in dbt is a great tool to have in the toolkit for someone writing a lot of Jinja code, but it might be difficult to understand how to use it and what benefits it brings. + +Let’s dive into the last time I used `debug()` and how it helped me solve bugs in my code. + + + +## Jinja in dbt + +While working on a feature for [the dbt_project_evaluator package](https://github.com/dbt-labs/dbt-project-evaluator), my dbt runs started to fail consistently providing me the following message: + +```plain text +16:49:26 Database error while running on-run-end +16:49:26 Encountered an error: +Runtime Error + Parser Error: +``` +That's it!?!? +
+

via GIPHY

+
+ +As my `on-run-end` configuration in `dbt_project.yml` was the following, I was at least comfortable with pinpointing that the issue was with my macro `print_dbt_project_evaluator_issues`: + +```yaml +on-run-end: "{{ dbt_project_evaluator.print_dbt_project_evaluator_issues() }}" +``` + +But except for this insight, there was no mention of a specific line or downstream macro failing—so, the first step was trying to understand which part of my code was raising the error. My two options were: + +1. Write a bunch of `print("Here")` or `log("there", info=true)` statements in my macros and see which ones get printed and which ones don’t +2. Use the `debug()` command to both find where my code is failing and look at my variables when the code is running + +As you might have guessed, this guide is about option #2. + +## Intro to `debug()` in Jinja + +`debug()` is a command available in dbt used to set breakpoints in your Jinja code. Those breakpoints stop the execution of your code and provide the ability to inspect variables and to run the following part of your code step by step. + +### How to use it + +First of all, `debug()` is not available in dbt Cloud as it does not provide full access to the terminal, so, you will have to install and use `dbt-core` locally. + +Then, to enter into the debug mode, you need to both: + +- Write `{{ debug() }}` in your code—where you want to start the debugger—and +- set up the environment variable `DBT_MACRO_DEBUGGING` to any value. This can be done for the entire shell session by typing `export DBT_MACRO_DEBUGGING=1` in the command line or for each command, by prepending the whole command with the environment variable, like `DBT_MACRO_DEBUGGING=1 dbt build`. Without this variable set, the `debug()` command will not be evaluated and therefore you will not enter debug mode. 
+ +### Back to our original issue, let’s use `debug` to pinpoint where our code has bugs + +If you put `{{ debug() }}` in one or multiple sections of your code, and while in debug mode you press `c` , the debugger will stop at each of your breakpoints, allowing you to find which part of the code is failing. + +In my case, + +```sql +{% set my_results = run_query(sql_statement) %} +{{ debug() }} +``` +failed without entering the debug mode, but +```sql +{{ debug() }} +{% set my_results = run_query(sql_statement) %} +``` +entered debug mode, telling me that there was something wrong with running my actual query. + +Now that we found where the issue is, can `debug()` help us fix it? Let’s look at the different commands available in the debugger. + +## Using the full power of Jinja debugging + +### debugging commands + +With the code in debug mode, we get a fully functional Python interactive debugger showing us this information: `ipdb>` (technically, `ipdb` stands for [IPython debugger](https://github.com/gotcha/ipdb)). + +The first command we can type is `h` to list the help and the available commands: + +```plain text +Documented commands (type help ): +======================================== +EOF clear display l pfile return tbreak where +a commands down list pinfo retval u +alias condition enable ll pinfo2 run unalias +args cont exit longlist pp rv undisplay +b context h n psource s unt +break continue help next q skip_hidden until +bt d ignore p quit skip_predicates up +c debug j pdef r source w +cl disable jump pdoc restart step whatis + +Miscellaneous help topics: +========================== +exec pdb + +Undocumented commands: +====================== +interact +``` + +This guide won't describe all the `ipdb` commands available to us, there are various online guides about the topic, but we will focus on the most useful ones in the majority of Jinja debugging cases: + +- `a`: Lists the current parameters for the functions you are in. 
+- `c`: Continue the execution of the code until the next breakpoint or the end of the program if there is no other breakpoint. +- `p` and `pp`: Print and pretty-print data. + - `p` will often print data in a single string, wrapped over multiple lines. + - `pp` will print the same information but will add newlines to make it easier to have a quick glance at a variable; `pp` is especially useful to print lists and dictionaries. + + +### Using the interactive prompt to solve our problem + +While in `ipdb`, you can also type some Python code to introspect your program and the current value of your variables. For example, typing `locals().keys()` or `p locals().keys()` returns the list of the current local variables (typing just `locals()` prints both the variable names and their values, which will most likely entirely fill your terminal). + +`ipdb` in Jinja won't return the list of variables with the exact same name as in your code, but you will see variables with very similar names, with just a prefix like `l_1_` or `l_2_` depending on the loops in your Jinja code. + +In my case, the debugger returns the following (shortened) list: + +```plain text +dict_keys(['l_1_schema_project_evaluator', 'l_1_db_project_evaluator', 't_2', ..., 'l_1_results', ..., 'l_2_graph', ..., 'l_2_sql_statement', 'environment', 'missing', 'resolve', 't_1', 'undefined']) +``` + +A tip I recommend is to look for variables with similar names to variables I either defined myself or read from in my code. Here, I can see `l_2_sql_statement` as part of my variables list and can also print its value in my terminal by typing `p l_2_sql_statement`. 
+Typing `p l_2_sql_statement` returned the following to my terminal: +```sql +`'\n select * from duck.main.model.dbt_project_evaluator.fct_documentation_coverage\n '` +``` + +We can directly see that there is an issue in the SQL generated as part of my macro as I am trying to read from `duck.main.model.dbt_project_evaluator.fct_documentation_coverage` (concatenating the database, schema and model unique id) instead of `duck.main.fct_documentation_coverage` (concatenating the database, schema and model table name). We found the issue. + +To fix it, we can then leverage the ability to modify variables while in debug mode. We can first assign a new value to the variable by typing `l_2_sql_statement = '\n select * from duck.main.fct_documentation_coverage\n '` and then typing `c` in the debugger to let the macro execute until it finishes or reaches a new breakpoint. In my case, the statement worked after I modified `l_2_sql_statement` and I can go back to the logic in my code to see why its value is not what I expected. + +### Using the debugger to analyze dbt Jinja variables + +The debugger can also be used to inspect the [out-of-the-box Jinja variables and functions made available with dbt](https://docs.getdbt.com/reference/dbt-jinja-functions). + +In my code, I was also looking at the [results object](https://docs.getdbt.com/reference/dbt-jinja-functions/on-run-end-context#results) available in the `on-run-end` context. We can actually see it in the previous list, called `l_1_results`. + +In the debugger, if I type `type(l_1_results)`, the program tells me that this is a `list`. I can then run a `type(l_1_results[0])` and dbt now tells me that the type of the variable is a `dbt.contracts.results.RunResult`. + +My last step to analyze the results object is to type `pp l_1_results[0].to_dict()` and the CLI then returns a pretty-printed version of all the fields and values available in the first item of my `results` object. 
+ +## Parting thoughts + +I hope that this short guide gave you an idea of how `debug()` could help you develop Jinja code more efficiently and investigate potential errors. And feel free to jump to [#advice-dbt-for-power-users](https://getdbt.slack.com/archives/C2JRRQDTL) in [the dbt Community Slack](https://www.getdbt.com/community/join-the-community/) if you want to discuss debugging in more depth! + diff --git a/website/blog/2023-04-17-dbt-squared.md b/website/blog/2023-04-17-dbt-squared.md new file mode 100644 index 00000000000..5cac73459a8 --- /dev/null +++ b/website/blog/2023-04-17-dbt-squared.md @@ -0,0 +1,116 @@ +--- +title: "dbt Squared: Leveraging dbt Core and dbt Cloud together at scale" +description: "How do you effectively scale dbt? João Antunes from Roche walks through their multi-tool journey." +slug: dbt-squared + +authors: [joao_antunes, yannick_misteli, sean_mcintyre] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2023-04-17 +is_featured: true +--- + +Teams thrive when each team member is provided with the tools that best complement and enhance their skills. You wouldn’t hand Cristiano Ronaldo a tennis racket and expect a perfect serve! At Roche, getting the right tools in the hands of our teammates was critical to our ability to grow our data team from 10 core engineers to over 100 contributors in just two years. We embraced both dbt Core and dbt Cloud at Roche (a dbt-squared solution, if you will!) to quickly scale our data platform. + +We faced three key areas of growth along this journey to scale: technology, people, and processes in an all-or-nothing game—getting only one right wouldn’t cut it. + +- **Technology**: dbt on the CLI was the right tool for core engineers, but couldn’t scale to our affiliate teams. dbt Cloud catalyzed their contribution to the platform. +- **People**: We needed to onboard teams well-versed in SQL, but new to dbt, into our workflow with as little friction as possible. 
+- **Process**: Local CI/CD, GitOps, and development processes on the core team needed to be adapted to allow for more contributors, and enforce quality at scale. + +dbt Core jump started our data platform’s growth, and dbt Cloud allowed us to spread it across the globe. Today, we are able to: + +- Power our platform in 35 countries, and run over 15,000 models and 40,000 tests every day. +- Support our core and country teams with the workflows that best suit them. +- Promote code to production in two week cycles instead of the previous quarter or semester-long cycles. + +To understand the changes that we made, let's dive into what our technology, people and process looked like at the beginning of this path. + +## Where we started + +Our dbt journey at Roche started roughly 3 years ago when we began to build a cloud native content recommendation system tailored to our Sales Reps. We started with a small team of under 10 people and we designed our core architecture based on some well-defined principles: deploy Everything as Code, choose Serverless whenever feasible, and apply Extract Load Transform as opposed to Extract Transform Load. + +With these principles in mind, we designed a highly scalable architecture, leveraging AWS-native services to extract and load the data into an S3-based data lake and a Redshift cluster Data Warehouse.  Keep in mind, when we started, Redshift Serverless was not yet a thing! All of the data transformation occurs in the warehouse with (you guessed it!) dbt. + +The simplicity of dbt, combined with the compute power of Redshift, allowed us to implement a Data Vault architecture capable of supporting our content recommendation system. After the success here, there was interest to scale the platform to a plethora of new use cases in the pharma commercial domain. 
+ +## The scalability problem + +Supporting the pharma domain meant we needed to reevaluate our dbt setup, as dozens of downstream teams would soon be building from the core team’s data assets. To be able to deliver the insights we wanted, we needed to get multiple teams collaborating on many dbt projects in a federated way without sacrificing quality or speed. + +### Technology + +We needed a way to make this architecture manageable when dozens of downstream teams were collaborating on the codebase simultaneously. Our first major architectural decision was how to separate the core team’s project from the country-specific projects, while still guaranteeing that each country team would be able to access the codebase of any other project if needed. Ensuring ease-of-access to other countries’ projects has a threefold purpose: + +1. Act as a catalyst for code reuse and best-practice sharing +2. Share common models and macros that span multiple countries across projects more easily +3. Promote a common workflow—an engineer working today for a Brazilian use-case could easily work tomorrow on a solution for Germany. + +The second architectural decision was whether or not to create a single dbt project for all 50+ country teams, or to follow a multi-project approach in which each country would have its own separate dbt project in the shared repo. It was critical that each country team was able to move at different paces and have full control over their domains. This would avoid issues like model name collisions across countries and remove dependencies that would risk cascading errors between countries. Therefore, we opted for a one project per country approach. + + + +The resulting data flow from core to country teams now follows this pattern. The *Sources* database holds all of the raw data in the Redshift cluster and the *Integrated* database contains the curated and ready-for-consumption outputs from the core dbt project. 
These outputs are termed Source Data Products (SDPs). These SDPs are then leveraged by the core team to build Global Data Products—products tailored to answering business questions for global stakeholders. They are also filtered at the country-level and used as sources to the country-specific Data Products managed by the country teams. These, in turn, are hosted in the respective `affiliate_db_` database. Segregating at the database-level facilitates data governance and privacy management. + + + +### People + +At the start of the journey, we built the core team from a blank canvas by cherry-picking individuals with a lot of engineering experience who were comfortable working on the command line. On the other hand, the country teams comprised people working on legacy data systems at the company—some with a deep understanding of technologies like Informatica, Talend, or Hadoop. All had one thing in common—no one had ever used dbt. + +We quickly realized that we needed to lower the barrier of entry for new teams to start leveraging the data platform. We wanted to relieve the teams from unnecessary exposure to overly complex, hard-to-read features in the core repo, and empower them to focus on their data modeling work exclusively. While dbt was clearly the right transformation tool, lack of experience on the command line demanded a more approachable, ready-to-use tool for these teams. + +### Process + +The success of this program relied on adopting DevOps practices from the start. This required a cultural shift, which can be particularly challenging in large scale organizations. We needed to take the DevOps processes that were working well for the core team, and scale them to dozens of teams to ensure the same level of quality and consistency in our data products. By seamlessly integrating dbt with git, our CI/CD processes were able to scale effortlessly, allowing for automated testing, building, and releasing of our pipelines. 
+ +Often overlooked, this third pillar of process can be the key to success when scaling a global platform. Simple things, such as accounting for time zone differences, can determine whether a message gets across the board. To facilitate the communication and coordination between Global and Country teams, all the teams follow the same sprint cycle, and we hold weekly scrum of scrums. We needed to set up extensive onboarding documentation, ensure newcomers had proper training and guidance, and create dedicated slack channels for announcements, incident reporting, and occasional random memes, helping build a community that stretches from Brazil to Malaysia. + + + +## The solution: dbt Squared + +### Technology and people + +Our teams’ differing levels of technical maturity demanded different technical solutions. The core team was well versed with dbt Core (even [contributing](https://github.com/dbt-labs/dbt-core/pull/3408) to the project!) and had been working on the same project for 3 years with software engineering best practices  (i.e. CI/CD, unit tests, linting, and pre-commit hooks). The affiliate teams had noticeably different exposure to these tools. Thus, we rolled out dbt Cloud to the country teams to avoid onboarding them to complex core workflows, which would have unnecessarily slowed them down. + +dbt Cloud removed the need to set up local environments; no need to worry about library versions or about installing git clients. Instead, they were quickly building and testing dbt models. As SQL is second nature to all of the country teams (irrespective of the platform they were using prior to dbt), they picked up dbt in no time at all, and the need for support from the core team quickly became minimal. + +This autonomy proved to be critical; it would otherwise be impractical to have a fully-centralized support team. We appointed regional leads to oversee the work of multiple country teams. 
This made country teams less reliant on core teams; now different countries could collaborate on dbt work independently. + +Doubling down on dbt Cloud had a big impact on how fast we could grow without compromising key features of software development. In the past, the initial setup of the tooling needed to start building data pipelines can take days. With this solution, code versioning, IDE, SQL previewer and lineage graphs were all in one place without any initial setup needed from the developers. In a matter of weeks, we started seeing the first data pipelines fully migrated. + +### Process + +Operating at scale meant we needed to adapt our processes that once worked for a  core team of ten to now work for a team of hundreds. + +- **Project Setup**: We needed to have a scalable way to provision new projects for the first time. +- **Development Flow**: We needed to make sure any team member could develop seamlessly, no matter how far downstream they sat. +- **Release Flow**: Releasing to one project was straightforward, but releasing to connected projects simultaneously needed considerable coordination. + +### Project setup flow + +Because we decided to go with a multi-project architecture, there was some initial setup needed. Each country would need a dbt project in our Git repository and also needed to be deployed in dbt Cloud…twice, as each affiliate has a dev and a prod project. To avoid setting up all of the projects manually in the dbt Cloud UI, we implemented a python library as a wrapper around the [dbt Cloud Administrative API](https://docs.getdbt.com/docs/dbt-cloud-apis/admin-cloud-api).  This would read a YAML configuration file and deploy all of the projects automatically. This saved a lot of time, as we would already have a new team’s dbt project setup in both the git repository and dbt Cloud as soon as they were ready to start building. 
+ +### Development flow + +The core teams using dbt on the CLI often leveraged the [defer command](https://docs.getdbt.com/reference/node-selection/defer), which can be used in dbt Cloud, but requires a [workaround](https://discourse.getdbt.com/t/possible-to-use-defer-to-testing-time-in-cloud-ide/6189) that involves injecting a production manifest file into your repo. Several rounds of fruitful discussions with the dbt Labs team led us towards using [“Proxy Views”](https://gist.github.com/boxysean/c1e0cb6735f6bbbb422cb06a14c3cd92), which emulate zero-copy clone functionality and allow for a similar `defer` workflow. For this solution to work, we also needed to override the `redshift__list_relations_without_caching` macro (for more details please read the comments of our Lead Engineer Jakub Lanski [here](https://github.com/dbt-labs/dbt-redshift/issues/179)). This enables each engineer to develop and test their models without the need to entirely recreate the upstream dependencies. Instead, these upstream model dependencies are created as views in the developer’s target schema that point to their production counterparts. This is particularly critical when implementing models that rely on dozens of upstream dependencies. By avoiding unnecessary data replication, we dramatically reduced development time. + +### Release flow + +Last but not least, the core team uses the [pre-commit](https://pre-commit.com/) framework to ensure code quality before opening merge requests to the common branch. The core team also leverages [sqlfluff](https://sqlfluff.com/) to standardize the code across several streams of work. Since dbt Cloud doesn’t yet offer the possibility to run the pre-commit hooks directly in the IDE, we migrated these workflows to CI checks. These checks are triggered when a merge request to the common branch is raised, guaranteeing that even if a developer is not using the framework locally, the code changes are evaluated before the merge is completed. 
+ +Now, not only was the pace of delivery much faster, we were also able to make investments in the incident management process. Rather than relying on a separate operations team, we allocate part of the development team to incident management, and we rotate the team members responsible for incident management on a sprint-by-sprint basis. As a result, we achieved a widespread culture of accountability that ultimately led to increased test coverage and code reliability. + +## Conclusion + +In less than one year, we managed to migrate siloed data pipelines from tools like Informatica, Spark, Talend and Oracle into dbt, powering close to 50 dashboards today. + +While we acknowledge the success story so far, we also believe the future of this endeavor depends heavily on how much we continue to invest in people. Therefore, we are creating a dbt fast-track path to prepare our team leads to earn [the dbt Certification](https://www.getdbt.com/blog/dbt-certification-program/). We foster close collaboration with the dbt Labs team which helps our organization set out for success as we plan our roadmap with expert advice. + +While successful scaling requires good technology, it also requires empowering your people and establishing strong processes.  Be sure to prioritize collaboration, communication, and training as you grow your dbt footprint. We hope this post has given you some useful insight and strategies for scaling the use of dbt in your organization. If you're facing similar challenges or have found other effective solutions, we'd love to hear from you in the comments below. 
+ + +*Editor's note: This article was written by João Antunes and Yannick Misteli of Roche, with editorial and technical guidance from Sean McIntyre of dbt Labs* \ No newline at end of file diff --git a/website/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt.md b/website/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt.md new file mode 100644 index 00000000000..ffc0369a908 --- /dev/null +++ b/website/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt.md @@ -0,0 +1,710 @@ +--- +title: "Building a Kimball dimensional model with dbt" +description: "Tackle dimensional modeling in dbt with this step-by-step tutorial from Jonathan Neo of Canva." +slug: kimball-dimensional-model + +authors: [jonathan_neo] + +tags: [analytics craft, dbt tutorials] +hide_table_of_contents: false + +date: 2023-04-20 +is_featured: true +--- + +Dimensional modeling is one of many data modeling techniques that are used by data practitioners to organize and present data for analytics. Other data modeling techniques include Data Vault (DV), Third Normal Form (3NF), and One Big Table (OBT) to name a few. + + + +While the relevance of dimensional modeling [has been debated by data practitioners](https://discourse.getdbt.com/t/is-kimball-dimensional-modeling-still-relevant-in-a-modern-data-warehouse/225/6), it is still one of the most widely adopted data modeling techniques for analytics. + +Despite its popularity, resources on how to create dimensional models using dbt remain scarce and lack detail. This tutorial aims to solve this by providing the definitive guide to dimensional modeling with dbt. 
+ +By the end of this tutorial, you will: + +- Understand dimensional modeling concepts +- Set up a mock dbt project and database +- Identify the business process to model +- Identify the fact and dimension tables +- Create the dimension tables +- Create the fact table +- Document the dimensional model relationships +- Consume the dimensional model + + + +## Dimensional modeling + +Dimensional modeling is a technique introduced by Ralph Kimball in 1996 with his book, [The Data Warehouse Toolkit](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/books/data-warehouse-dw-toolkit/). + +The goal of dimensional modeling is to take raw data and transform it into Fact and Dimension tables that represent the business. + + + +The benefits of dimensional modeling are: + +- **Simpler data model for analytics**: Users of dimensional models do not need to perform complex joins when consuming a dimensional model for analytics. Performing joins between fact and dimension tables are made simple through the use of surrogate keys. +- Don’t repeat yourself: Dimensions can be easily re-used with other fact tables to avoid duplication of effort and code logic. Reusable dimensions are referred to as conformed dimensions. +- **Faster data retrieval**: Analytical queries executed against a dimensional model are significantly faster than a 3NF model since data transformations like joins and aggregations have been already applied. +- **Close alignment with actual business processes**: Business processes and metrics are modeled and calculated as part of dimensional modeling. This helps ensure that the modeled data is easily usable. + +Now that we understand the broad concepts and benefits of dimensional modeling, let’s get hands-on and create our first dimensional model using dbt. + +## Part 1: Setup dbt project and database + +### Step 1: Before you get started + +Before you can get started: + +- You must have either DuckDB or PostgreSQL installed. 
Choose one, and download and install the database using one of the following links: + - Download [DuckDB](https://duckdb.org/docs/installation/index) + - Download [PostgreSQL](https://www.postgresql.org/download/) +- You must have Python 3.8 or above installed +- You must have dbt version 1.3.0 or above installed +- You should have a basic understanding of [SQL](https://www.sqltutorial.org/) +- You should have a basic understanding of [dbt](https://docs.getdbt.com/quickstarts) + +### Step 2: Clone the repository + +Clone the [github repository](https://github.com/Data-Engineer-Camp/dbt-dimensional-modelling) by running this command in your terminal: + +```text +git clone https://github.com/Data-Engineer-Camp/dbt-dimensional-modelling.git +cd dbt-dimensional-modelling/adventureworks +``` + +### Step 3: Install dbt database adaptors + +Depending on which database you’ve chosen, install the relevant database adaptor for your database: + +```text +# install adaptor for duckdb +pip install dbt-duckdb + +# OR + +# install adaptor for postgresql +pip install dbt-postgres +``` + +### Step 4: Setup dbt profile + +The dbt profile (see `adventureworks/profiles.yml`) has already been pre-configured for you. 
Verify that the configurations are set correctly based on your database credentials: + +```yaml +adventureworks: + target: duckdb # leave this as duckdb, or change this to your chosen database + + # supported databases: duckdb, postgres + outputs: + duckdb: + type: duckdb + path: target/adventureworks.duckdb + threads: 12 + + postgres: + type: postgres + host: localhost + user: postgres + password: postgres + port: 5432 + dbname: adventureworks # create this empty database beforehand + schema: dbo + threads: 12 +``` + +### Step 5: Install dbt dependencies + +We use packages like [dbt_utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) in this project, and we need to install the libraries for this package by running the command: + +``` +dbt deps +``` + +### Step 6: Seed your database + +We are using [dbt seeds](https://docs.getdbt.com/docs/build/seeds) (see `adventureworks/seeds/*`) to insert AdventureWorks data into your database: + +```text +# seed duckdb +dbt seed --target duckdb + +# seed postgres +dbt seed --target postgres +``` + +### Step 7: Examine the database source schema + +All data generated by the business is stored on an OLTP database. The Entity Relationship Diagram (ERD) of the database has been provided to you. + +Examine the database source schema below, paying close attention to: + +- Tables +- Keys +- Relationships + + + +### Step 8: Query the tables + +Get a better sense of what the records look like by executing select statements using your database's SQL editor. 
+ +For example: + +```sql +select * from sales.salesorderheader limit 10; +``` + +Output: + +``` +┌──────────────┬──────────────┬─────────────────┬───┬───────────────┬─────────────────────┬────────────────┐ +│ salesorderid │ shipmethodid │ billtoaddressid │ … │ salespersonid │ shipdate │ accountnumber │ +│ int32 │ int32 │ int32 │ │ int32 │ timestamp │ varchar │ +├──────────────┼──────────────┼─────────────────┼───┼───────────────┼─────────────────────┼────────────────┤ +│ 43659 │ 5 │ 985 │ … │ 279 │ 2011-06-07 00:00:00 │ 10-4020-000676 │ +│ 43660 │ 5 │ 921 │ … │ 279 │ 2011-06-07 00:00:00 │ 10-4020-000117 │ +│ 43661 │ 5 │ 517 │ … │ 282 │ 2011-06-07 00:00:00 │ 10-4020-000442 │ +│ 43662 │ 5 │ 482 │ … │ 282 │ 2011-06-07 00:00:00 │ 10-4020-000227 │ +│ 43663 │ 5 │ 1073 │ … │ 276 │ 2011-06-07 00:00:00 │ 10-4020-000510 │ +│ 43664 │ 5 │ 876 │ … │ 280 │ 2011-06-07 00:00:00 │ 10-4020-000397 │ +│ 43665 │ 5 │ 849 │ … │ 283 │ 2011-06-07 00:00:00 │ 10-4020-000146 │ +│ 43666 │ 5 │ 1074 │ … │ 276 │ 2011-06-07 00:00:00 │ 10-4020-000511 │ +│ 43667 │ 5 │ 629 │ … │ 277 │ 2011-06-07 00:00:00 │ 10-4020-000646 │ +│ 43668 │ 5 │ 529 │ … │ 282 │ 2011-06-07 00:00:00 │ 10-4020-000514 │ +├──────────────┴──────────────┴─────────────────┴───┴───────────────┴─────────────────────┴────────────────┤ +│ 10 rows 23 columns (6 shown) │ +└──────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +When you’ve successfully set up the dbt project and database, we can now move into the next part to identify the tables required for a dimensional model. + +## Part 2: Identify the business process + +Now that you’ve set up the dbt project, database, and have taken a peek at the schema, it’s time for you to identify the business process. + +Identifying the business process is done in collaboration with the business user. The business user has context around the business objectives and business processes, and can provide you with that information. 
+ + + +Upon speaking with the CEO of AdventureWorks, you learn the following information: + +> AdventureWorks manufactures bicycles and sells them to consumers (B2C) and businesses (B2B). The bicycles are shipped to customers from all around the world. As the CEO of the business, I would like to know how much revenue we have generated for the year ending 2011, broken down by: +- Product category and subcategory +- Customer +- Order status +- Shipping country, state, and city + +Based on the information provided by the business user, you have identified that the business process in question is the ***Sales process***. In the next part, you are going to design a dimensional model for the Sales process. + +## Part 3: Identify the fact and dimension tables + +Based on the information provided from the earlier part, we want to create a dimensional model that represents that business’ Sales process and also be able to slice and dice the data by: + +- Product category and subcategory +- Customer +- Order status +- Shipping country, state, and city +- Date (year, month, day) + +### Fact tables + +:::info +[Fact tables](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/facts-for-measurement/) are database tables that represent a business process in the real world. Each record in the fact table represents a business event such as a: + +- Item sale +- Website click +- Production work order +::: + +There are two tables in the sales schema that catch our attention. These two tables can be used to create the fact table for the sales process: + +- The `sales.salesorderheader` table contains information about the credit card used in the order, the shipping address, and the customer. Each record in this table represents an order header that contains one or more order details. 
+- The `sales.salesorderdetail` table contains information about the product that was ordered, and the order quantity and unit price, which we can use to calculate the revenue. Each record in this table represents a single order detail. + + + +Let’s define a fact table called `fct_sales` which joins `sales.salesorderheader` and `sales.salesorderdetail` together. Each record in the fact table (also known as the [grain](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/grain/)) is an order detail. + + + +### Dimension tables + +:::info +[Dimension tables](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/dimensions-for-context/) are used to represent contextual or descriptive information for a business process event. Examples of dimensions include: + +- Customer details: Who is the customer for a particular order number? +- Website click location details: Which button is the user clicking on? +- Product details: What are the details of the product that was added to the cart? +::: + +Based on the business questions that our business user would like answered, we can identify several tables that would contain useful contextual information for our business process: + +- `person.address` +- `person.countryregion` +- `production.product` +- `production.productcategory` +- `sales.customer` +- `sales.store` +- And many more … + +There are different ways we could create the dimension tables. We could use the existing relationships between the tables as depicted in the diagram below. + + + +This is known as a snowflake schema design, where the fact table is the centre of the snowflake, and there are many fractals branching off the centre of the snowflake. However, this results in many joins that need to be performed by the consumer of the dimensional model. 
+ +Instead, we can denormalize the dimension tables by performing joins. + + + +This is known as a star schema and this approach reduces the amount of joins that need to be performed by the consumer of the dimensional model. + +Using the star schema approach, we can identify 6 dimensions as shown below that will help us answer the business questions: + + + +- `dim_product` : a dimension table that joins `product` , `productsubcategory`, `productcategory` +- `dim_address` : a dimension table that joins `address` , `stateprovince`, `countryregion` +- `dim_customer` : a dimension table that joins `customer` , `person` , `store` +- `dim_credit_card` : a dimension table created from `creditcard` +- `dim_order_status` : a dimension table created by taking distinct statuses from `salesorderheader` +- `dim_date` : a specially generated dimension table containing date attributes using the [dbt_date](https://hub.getdbt.com/calogica/dbt_date/latest/) package. + +:::note +We have manually seeded the `dim_date` table since DuckDB is not supported by the dbt_date package. +::: + +In the next part, we use dbt to create the fact and dimension tables we have identified. + +## Part 4: Create the dimension tables + +Let's first create `dim_product` . The other dimension tables will use the same steps that we’re about to go through. + +### Step 1: Create model files + +Let’s create the new dbt model files that will contain our transformation code. Under `adventureworks/models/marts/` , create two files: + +- `dim_product.sql` : This file will contain our SQL transformation code. +- `dim_product.yml` : This file will contain our documentation and tests for `dim_product` . + +``` +adventureworks/models/ +└── marts + ├── dim_product.sql + ├── dim_product.yml +``` + +### Step 2: Fetch data from the upstream tables + +In `dim_product.sql`, you can select data from the upstream tables using Common Table Expressions (CTEs). 
+ +```sql +with stg_product as ( + select * + from {{ ref('product') }} +), + +stg_product_subcategory as ( + select * + from {{ ref('productsubcategory') }} +), + +stg_product_category as ( + select * + from {{ ref('productcategory') }} +) + +... +``` + +We use the `ref` function to reference the upstream tables and create a Directed Acyclic Graph (DAG) of the dependencies. + +### Step 3: Perform the joins + +Next, perform the joins between the CTE tables using the appropriate join keys. + +```sql +... + +select + ... +from stg_product +left join stg_product_subcategory on stg_product.productsubcategoryid = stg_product_subcategory.productsubcategoryid +left join stg_product_category on stg_product_subcategory.productcategoryid = stg_product_category.productcategoryid +``` + +### Step 4: Create the surrogate key + +:::info +[Surrogate keys](https://www.kimballgroup.com/1998/05/surrogate-keys/) provide consumers of the dimensional model with an easy-to-use key to join the fact and dimension tables together, without needing to understand the underlying business context. +::: + +There are several approaches to creating a surrogate key: + +- **Hashing surrogate key**: a surrogate key that is constructed by hashing the unique keys of a table (e.g. `md5(key_1, key_2, key_3)` ). +- **Incrementing surrogate key**: a surrogate key that is constructed by using a number that is always incrementing (e.g. `row_number()`). +- **Concatenating surrogate key**: a surrogate key that is constructed by concatenating the unique key columns (e.g. `concat(key_1, key_2, key_3)` ). + +We are using arguably the easiest approach which is to perform a hash on the unique key columns of the dimension table. This approach removes the hassle of performing a join with dimension tables when generating the surrogate key for the fact tables later. + +To generate the surrogate key, we use a dbt macro that is provided by the `dbt_utils` package called `generate_surrogate_key()` . 
The `generate_surrogate_key()` macro uses the appropriate hashing function from your database to generate a surrogate key from a list of key columns (e.g. `md5()`, `hash()`).
In this example, we have chosen to go with `table`, and have set the materialization type for all dimensional models in the `marts` schema to `table` in `dbt_project.yml` + +```sql +models: + adventureworks: + marts: + +materialized: table + +schema: marts +``` + +### Step 7: Create model documentation and tests + +Alongside our `dim_product.sql` model, we can populate the corresponding `dim_product.yml` file to document and test our model. + +```yaml +version: 2 + +models: + - name: dim_product + columns: + - name: product_key + description: The surrogate key of the product + tests: + - not_null + - unique + - name: productid + description: The natural key of the product + tests: + - not_null + - unique + - name: product_name + description: The product name + tests: + - not_null +``` + +### Step 8: Build dbt models + +Execute the [dbt run](https://docs.getdbt.com/reference/commands/run) and [dbt test](https://docs.getdbt.com/reference/commands/run) commands to run and test your dbt models: + +``` +dbt run && dbt test +``` + +We have now completed all the steps to create a dimension table. We can now repeat the same steps to all dimension tables that we have identified earlier. Make sure to create all dimension tables before moving on to the next part. + +## Part 5: Create the fact table + +After we have created all required dimension tables, we can now create the fact table for `fct_sales`. + +### Step 1: Create model files + +Let’s create the new dbt model files that will contain our transformation code. Under `adventureworks/models/marts/` , create two files: + +- `fct_sales.sql` : This file will contain our SQL transformation code. +- `fct_sales.yml` : This file will contain our documentation and tests for `fct_sales` . + +``` +adventureworks/models/ +└── marts + ├── fct_sales.sql + ├── fct_sales.yml +``` + +### Step 2: Fetch data from the upstream tables + +To answer the business questions, we need columns from both `salesorderheader` and `salesorderdetail`. 
Let’s reflect that in `fct_sales.sql` : + +```sql +with stg_salesorderheader as ( + select + salesorderid, + customerid, + creditcardid, + shiptoaddressid, + status as order_status, + cast(orderdate as date) as orderdate + from {{ ref('salesorderheader') }} +), + +stg_salesorderdetail as ( + select + salesorderid, + salesorderdetailid, + productid, + orderqty, + unitprice, + unitprice * orderqty as revenue + from {{ ref('salesorderdetail') }} +) + +... +``` + +### Step 3: Perform joins + +The grain of the `fct_sales` table is one record in the SalesOrderDetail table, which describes the quantity of a product within a SalesOrderHeader. So we perform a join between `salesorderheader` and `salesorderdetail` to achieve that grain. + +```sql +... + +select + ... +from stg_salesorderdetail +inner join stg_salesorderheader on stg_salesorderdetail.salesorderid = stg_salesorderheader.salesorderid +``` + +### Step 4: Create the surrogate key + +Next, we create the surrogate key to uniquely identify each row in the fact table. Each row in the `fct_sales` table can be uniquely identified by the `salesorderid` and the `salesorderdetailid` which is why we use both columns in the `generate_surrogate_key()` macro. + +```sql +... + +select + {{ dbt_utils.generate_surrogate_key(['stg_salesorderdetail.salesorderid', 'salesorderdetailid']) }} as sales_key, + ... +from stg_salesorderdetail +inner join stg_salesorderheader on stg_salesorderdetail.salesorderid = stg_salesorderheader.salesorderid +``` + +### Step 5: Select fact table columns + +You can now select the fact table columns that will help us answer the business questions identified earlier. We want to be able to calculate the amount of revenue, and therefore we include a column revenue per sales order detail which was calculated above by `unitprice * orderqty as revenue` . + +```sql +... 
+ +select + {{ dbt_utils.generate_surrogate_key(['stg_salesorderdetail.salesorderid', 'salesorderdetailid']) }} as sales_key, + stg_salesorderdetail.salesorderid, + stg_salesorderdetail.salesorderdetailid, + stg_salesorderdetail.unitprice, + stg_salesorderdetail.orderqty, + stg_salesorderdetail.revenue +from stg_salesorderdetail +inner join stg_salesorderheader on stg_salesorderdetail.salesorderid = stg_salesorderheader.salesorderid +``` + +### Step 6: Create foreign surrogate keys + +We want to be able to slice and dice our fact table against the dimension tables we have created in the earlier step. So we need to create the foreign surrogate keys that will be used to join the fact table back to the dimension tables. + +We achieve this by applying the `generate_surrogate_key()` macro to the same unique id columns that we had previously used when generating the surrogate keys in the dimension tables. + +```sql +... + +select + {{ dbt_utils.generate_surrogate_key(['stg_salesorderdetail.salesorderid', 'salesorderdetailid']) }} as sales_key, + {{ dbt_utils.generate_surrogate_key(['productid']) }} as product_key, + {{ dbt_utils.generate_surrogate_key(['customerid']) }} as customer_key, + {{ dbt_utils.generate_surrogate_key(['creditcardid']) }} as creditcard_key, + {{ dbt_utils.generate_surrogate_key(['shiptoaddressid']) }} as ship_address_key, + {{ dbt_utils.generate_surrogate_key(['order_status']) }} as order_status_key, + {{ dbt_utils.generate_surrogate_key(['orderdate']) }} as order_date_key, + stg_salesorderdetail.salesorderid, + stg_salesorderdetail.salesorderdetailid, + stg_salesorderdetail.unitprice, + stg_salesorderdetail.orderqty, + stg_salesorderdetail.revenue +from stg_salesorderdetail +inner join stg_salesorderheader on stg_salesorderdetail.salesorderid = stg_salesorderheader.salesorderid +``` + +### Step 7: Choose a materialization type + +You may choose from one of the following materialization types supported by dbt: + +- View +- Table +- Incremental + 
+It is common for fact tables to be materialized as `incremental` or `table` depending on the data volume size. [As a rule of thumb](https://docs.getdbt.com/docs/build/incremental-models#when-should-i-use-an-incremental-model), if you are transforming millions or billions of rows, then you should start using the `incremental` materialization. In this example, we have chosen to go with `table` for simplicity. + +### Step 8: Create model documentation and tests + +Alongside our `fct_sales.sql` model, we can populate the corresponding `fct_sales.yml` file to document and test our model. + +```yaml +version: 2 + +models: + - name: fct_sales + columns: + + - name: sales_key + description: The surrogate key of the fct sales + tests: + - not_null + - unique + + - name: product_key + description: The foreign key of the product + tests: + - not_null + + - name: customer_key + description: The foreign key of the customer + tests: + - not_null + + ... + + - name: orderqty + description: The quantity of the product + tests: + - not_null + + - name: revenue + description: The revenue obtained by multiplying unitprice and orderqty +``` + +### Step 9: Build dbt models + +Execute the [dbt run](https://docs.getdbt.com/reference/commands/run) and [dbt test](https://docs.getdbt.com/reference/commands/run) commands to run and test your dbt models: + +``` +dbt run && dbt test +``` + +Great work, you have successfully created your very first fact and dimension tables! Our dimensional model is now complete!! 🎉  + +## Part 6: Document the dimensional model relationships + +Let’s make it easier for consumers of our dimensional model to understand the relationships between tables by creating an [Entity Relationship Diagram (ERD)](https://www.visual-paradigm.com/guide/data-modeling/what-is-entity-relationship-diagram/). 
+ + + +The ERD will enable consumers of our dimensional model to quickly identify the keys and relationship type (one-to-one, one-to-many) that need to be used to join tables. + +## Part 7: Consume dimensional model + +Finally, we can consume our dimensional model by connecting our data warehouse to our Business Intelligence (BI) tools such as Tableau, Power BI, and Looker. + +Most modern BI tools have a built-in semantic layer that supports relationships between tables, which is required if we want to consume the dimensional models directly without any additional data transformation. + +In Looker for example, we can define relationships using [LookML](https://cloud.google.com/looker/docs/what-is-lookml): + +``` +explore: fct_order { + join: dim_user { + sql_on: ${fct_order.user_key} = ${dim_user.user_key} ;; + relationship: many_to_one + } +} +``` + +If your BI tool doesn’t have a semantic layer that supports relationships, then you will have to reflect that relationship by creating a One Big Table (OBT) that joins the fact table against all of its dimension tables. 
+ +```sql +with f_sales as ( + select * from {{ ref('fct_sales') }} +), + +d_customer as ( + select * from {{ ref('dim_customer') }} +), + +d_credit_card as ( + select * from {{ ref('dim_credit_card') }} +), + +d_address as ( + select * from {{ ref('dim_address') }} +), + +d_order_status as ( + select * from {{ ref('dim_order_status') }} +), + +d_product as ( + select * from {{ ref('dim_product') }} +), + +d_date as ( + select * from {{ ref('dim_date') }} +) + +select + {{ dbt_utils.star(from=ref('fct_sales'), relation_alias='f_sales', except=[ + "product_key", "customer_key", "creditcard_key", "ship_address_key", "order_status_key", "order_date_key" + ]) }}, + {{ dbt_utils.star(from=ref('dim_product'), relation_alias='d_product', except=["product_key"]) }}, + {{ dbt_utils.star(from=ref('dim_customer'), relation_alias='d_customer', except=["customer_key"]) }}, + {{ dbt_utils.star(from=ref('dim_credit_card'), relation_alias='d_credit_card', except=["creditcard_key"]) }}, + {{ dbt_utils.star(from=ref('dim_address'), relation_alias='d_address', except=["address_key"]) }}, + {{ dbt_utils.star(from=ref('dim_order_status'), relation_alias='d_order_status', except=["order_status_key"]) }}, + {{ dbt_utils.star(from=ref('dim_date'), relation_alias='d_date', except=["date_key"]) }} +from f_sales +left join d_product on f_sales.product_key = d_product.product_key +left join d_customer on f_sales.customer_key = d_customer.customer_key +left join d_credit_card on f_sales.creditcard_key = d_credit_card.creditcard_key +left join d_address on f_sales.ship_address_key = d_address.address_key +left join d_order_status on f_sales.order_status_key = d_order_status.order_status_key +left join d_date on f_sales.order_date_key = d_date.date_key +``` + +In the OBT above, we perform joins between the fact and dimension tables using the surrogate keys. 
+ +Using `dbt_utils.star()`, we select all columns except the surrogate key columns since the surrogate keys don't hold any meaning besides being useful for the joins. + +We can then build the OBT by running `dbt run`. Your dbt DAG should now look like this: + + + +Congratulations, you have reached the end of this tutorial. If you want to learn more, please see the learning resources below on dimensional modeling. + +## Learning resources + +- [Kimball group learning resources](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/) +- [The Data Warehouse toolkit book](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/books/data-warehouse-dw-toolkit/) +- [dbt discourse on whether dimensional modeling is still relevant](https://discourse.getdbt.com/t/is-kimball-dimensional-modeling-still-relevant-in-a-modern-data-warehouse/225) +- [dbt glossary on dimensional modeling](https://docs.getdbt.com/terms/dimensional-modeling) + +If you have any questions about the material, please reach out to me on the dbt Community Slack (@Jonathan Neo), or on [LinkedIn](https://www.linkedin.com/in/jonneo/). + +*Author's note: The materials in this article were created by [Data Engineer Camp](https://dataengineercamp.com/), a 16-week data engineering bootcamp for professionals looking to transition to data engineering and analytics engineering. 
The article was written by Jonathan Neo, with editorial and technical guidance from [Kenny Ning](https://www.linkedin.com/in/kenny-ning/) and editorial review from [Paul Hallaste](https://www.linkedin.com/in/paulhallaste/) and [Josh Devlin](https://www.linkedin.com/in/josh-devlin/).* diff --git a/website/blog/2023-04-24-framework-refactor-alteryx-dbt.md b/website/blog/2023-04-24-framework-refactor-alteryx-dbt.md new file mode 100644 index 00000000000..c5b677f7f3e --- /dev/null +++ b/website/blog/2023-04-24-framework-refactor-alteryx-dbt.md @@ -0,0 +1,134 @@ +--- +title: "How we reduced a 6-hour runtime in Alteryx to 9 minutes with dbt and Snowflake" +description: Learn how the folks at Indicium Tech leveraged the modularity and visibility features of dbt to reduce a 6-hour runtime in Alteryx to only 9 minutes by implementing a refactoring workflow. +authors: [arthur_marcon, lucas_bergodias, christian_vanbellen] +slug: framework-refactor-alteryx-dbt +tags: [analytics craft] +hide_table_of_contents: false +date: 2023-04-25 +is_featured: true +--- + +Alteryx is a visual data transformation platform with a user-friendly interface and drag-and-drop tools. Nonetheless, Alteryx may have difficulties to cope with the complexity increase within an organization’s data pipeline, and it can become a suboptimal tool when companies start dealing with large and complex data transformations. In such cases, moving to dbt can be a natural step, since dbt is designed to manage complex data transformation pipelines in a scalable, efficient, and more explicit manner. Also, this transition involved migrating from on-premises SQL Server to Snowflake cloud computing. In this article, we describe the differences between Alteryx and dbt, and how we reduced a client's 6-hour runtime in Alteryx to 9 minutes with dbt and Snowflake at Indicium Tech. 
+ + + +## Introduction + +Transforming data to follow business rules can be a complex task, especially with the increasing amount of data collected by companies. To reduce such complexity, data transformation solutions designed as drag-and-drop tools can be seen as more intuitive, since analysts can visualize the steps taken to transform data. One example of a popular drag-and-drop transformation tool is Alteryx which allows business analysts to transform data by dragging and dropping operators in a canvas. The graphic interface of Alteryx Designer is presented in **Figure 1**. + + + +Nonetheless, as data workflows become more complex, Alteryx lacks the modularity, documentation, and version control capabilities that these flows require. In this sense, dbt may be a more suitable solution to building resilient and modular data pipelines due to its focus on data modeling. + +**This article reports our experience migrating a large client's data workflow from Alteryx to dbt over the course of three months. After model refactoring, model runtime was reduced from 6 hours to 9 minutes in dbt, with clearer lineage of models and better documentation and version control.** + +To that end, we: + +- Defined which models would be prioritized together with the client's team, +- Defined which approach would be used to refactor Alteryx workflows to dbt models, +- Audited refactored models to make sure they matched the outputs from the original Alteryx workflow, and +- Replaced clients' data sources to the dbt refactored models. + +We hope that our experience can be useful for analytics engineers who are looking for a high-level framework to help in the transition from Alteryx workflows to dbt, and that it can help them to see the bigger picture in model refactoring. + +## Who isn't this post for? + +While we feel that dbt is a better transformation tool than Alteryx for most use cases, we acknowledge that a migration from Alteryx to dbt isn’t appropriate for everyone. 
Alteryx is designed for data analysts, but its capabilities are well-suited for business users, including marketing, sales, accounting, and HR. Alteryx may be a good enough tool when: + +- You have a small number of transformations +- The transformations are relatively simple +- Transformations don't need to run frequently +- You want non-technical users to manage this process + +Focusing more on data pipeline visibility and a friendlier user experience, Alteryx excels while working with smaller, more understandable data flows, where the Analytics Engineer (AE) can really visualize how the data is being transformed from the source all the way downstream to each output. + +When it comes to handling complex data structures, dbt has several features that make it superior to Alteryx. As we will see ahead with more details, in a data stack transition context, when long and complex data flows are common, dbt is often faster than Alteryx. That happens for a few reasons (**Table 2**): + +| Aspect | dbt | Alteryx | +| --- | --- | --- | +| Development experience | Command-line interface and IDE | Graphical user interface | +| Goal | Designed for data transformation and modeling | Data manipulation and analysis capabilities | +| Optimization | Takes advantage of query optimization capabilities | It does not reuse the same source that has already been executed by a model and runs it again | +| Run logic | Processes only changed data for large data sets (incremental run) | Processes all data every time it is run | + +*
**Table 2** — High-level comparison between dbt and Alteryx
* + +## A step-by-step guide on how we moved Alteryx workflows into dbt models + +### Case description + +This blog post reports a consulting project for a major client at Indicium Tech®, which will be kept anonymous. The client is a global technology company that specializes in providing enterprise content management and automation solutions. Several data analytic softwares were implemented by the organization to store and analyze data. Because the data transformation step is not concentrated in one single software, analyzing and transforming data has gotten increasingly complex and expensive over time. Especially, because the company purchased many data transformation tools (such as Alteryx, Tableau Prep, Power BI and SQL Server Stored Procedures) that were used across different teams. This hampered having one single source of truth and a centralized data transformation platform. + +When the client hired Indicium, they had dozens of Alteryx workflows built and running daily solely for the marketing team, which was the focus of the project. For the marketing team, the Alteryx workflows had to be executed in the correct order since they were interdependent, which means one Alteryx workflow used the outcome of the previous one, and so on. The main Alteryx workflows run daily by the marketing team took about 6 hours to run. Another important aspect to consider was that if a model had not finished running when the next one downstream began to run, the data would be incomplete, requiring the workflow to be run again. The execution of all models was usually scheduled to run overnight and by early morning, so the data would be up to date the next day. But if there was an error the night before, the data would be incorrect or out of date. **Figure 3** exemplifies the scheduler. + + + +Data lineage was a point that added a lot of extra labor because it was difficult to identify which models were dependent on others with so many Alteryx workflows built. 
When the number of workflows increased, it took a long time to create a view of that lineage in another tool.
Figure 4 — Steps followed for Alteryx to dbt model refactoring
* + +#### Step 1: Start by refactoring smaller Alteryx workflows and then move on to more complex ones + +Understanding where to begin the refactoring process is very important, as it directly impacts the client's perception of value delivery. For some clients, it may be better to start with minor models to understand the best approach to model refactoring. Starting with shorter and less complex Alteryx workflows can be a way of creating a proof of concept and having small/quick wins. Also, this approach can be used to provide evidence of dbt's superior run performance for skeptical clients. + +On the other hand, some clients may prefer to start with their most important or most used models to have the major business intelligence reports running on dbt as soon as possible. Although this approach allows for greater value delivery, it will probably take longer for AEs to refactor these workflows due to their complexity of transformations and steps involved in the workflow. + +We adopted a mixed approach by starting with one or two simpler workflows to gain experience and confidence with the refactoring process and then moving on to refactoring the client's most important workflows. This approach provides for a great balance between time and value delivery. + +#### Step 2: Identify the source models and refactor the Alteryx from left to right + +The first step is to validate all data sources and create one common table expression (CTE) for each source referenced in the specific Alteryx workflow being refactored, so that it is easy to reuse them throughout the model. + +It is essential to click on each data source (the green book icons on the leftmost side of **Figure 5**) and examine whether any transformations have been done inside that data source query. It is very common for a source icon to contain more than one data source or filter, which is why this step is important. 
The next step is to follow the workflow and transcribe the transformations into SQL queries in the dbt models to replicate the same data transformations as in the Alteryx workflow. + + + + +For this step, we identified which operators were used in the data source (for example, joining data, order columns, group by, etc). Usually the Alteryx operators are pretty self-explanatory and all the information needed for understanding appears on the left side of the menu. We also checked the documentation to understand how each Alteryx operator works behind the scenes. + +We followed dbt Labs' guide on how to refactor legacy SQL queries in dbt and some [best practices](https://docs.getdbt.com/guides/migration/tools/refactoring-legacy-sql). After we finished refactoring all the Alteryx workflows, we checked if the Alteryx output matched the output of the refactored model built on dbt. + +#### Step 3: Use the `audit_helper` package to audit refactored data models + +Auditing large models, with sometimes dozens of columns and millions of rows, can be a really tough task to execute manually. It is humanly impossible to validate columns one by one, joining tables by their primary key and measuring compatibility through hand-made SQL. Fortunately, there are a couple of dbt packages built entirely for the purpose of automating this process! + +In this project, we used [the `audit_helper` package](https://github.com/dbt-labs/dbt-audit-helper), because it provides more robust auditing macros and offers more automation possibilities for our use case. To that end, we needed to have both the legacy Alteryx workflow output table and the refactored dbt model materialized in the project’s data warehouse. Then we used the macros available in `audit_helper` to compare query results, data types, column values, row numbers and many more things that are available within the package. 
For an in-depth explanation and tutorial on how to use the `audit_helper` package, check out [this blog post](https://docs.getdbt.com/blog/audit-helper-for-migration). **Figure 6** graphically illustrates the validation logic behind audit_helper. + + + +#### Step 4: Duplicate reports and connect them to the dbt refactored models + +With the models refactored and audited, it is time to plug them in the BI report tool. Although some will be brave enough to plug the refactored model directly into the original BI report, we recommend duplicating the BI report and connecting this replica to the newly refactored dbt model. + +This approach allows you to compare the two reports side by side and check how data behaves in the visualizations created. Also, it can function as a step to double check that values match in refactored and legacy tables. Therefore, at times, it may be necessary to go back to the transformation step and cast column types or change a business rule, for example. + +## The gains of the refactoring process + +Successfully converting an entire set of data workflows from the Alteryx engine to dbt is surely not a trivial task, but the implementation of this framework, as a result of a trial-and-error learning process from the team, allowed us to accelerate this process, while its data auditing focus enabled delivering data with visible and automated quality assurance. + +The conversion proved to be of great value to the client due to three main aspects of the new dbt-based data stack, which were observed by both teams: + +- Incredibly shortened run time: Perhaps the most impressive result obtained, the total run time of the marketing team’s data workflow was reduced from more than **6 hours** to just **9 minutes**. This represents a **run time reduction of more than** **40x**. 
Much of this comes from transitioning from on-premises SQL Server to Snowflake cloud computing, dbt’s efficient SQL compilation and materialization options, and the sequential, lineage-based execution (see Figure 7).
+ +## References + +> [Migrating from Stored Procedures to dbt](https://docs.getdbt.com/blog/migrating-from-stored-procs) +> +> +> [Audit_helper in dbt: Bringing data auditing to a higher level](https://docs.getdbt.com/blog/audit-helper-for-migration) +> +> [Refactoring legacy SQL to dbt](https://docs.getdbt.com/guides/migration/tools/refactoring-legacy-sql) diff --git a/website/blog/2023-04-26-deprecating-dbt-metrics.md b/website/blog/2023-04-26-deprecating-dbt-metrics.md new file mode 100644 index 00000000000..bf23bb992ad --- /dev/null +++ b/website/blog/2023-04-26-deprecating-dbt-metrics.md @@ -0,0 +1,71 @@ +--- +title: "Why we're deprecating the dbt_metrics package" +description: "We are bidding adieu to dbt_metrics and moving forward with MetricFlow! Discover how this new source-available project lays the foundation for the dbt Semantic Layer. Let's dive in!" +slug: deprecating-dbt-metrics + +authors: [callum_mccann] + +hide_table_of_contents: false + +date: 2023-04-26 +is_featured: true +--- + + +Hello, my dear data people. + +If you haven’t read [Nick & Roxi’s blog post about what’s coming in the future of the dbt Semantic Layer](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), I highly recommend you read through that, as it gives helpful context around what the future holds. + +With that said, it has come time for us to bid adieu to our beloved dbt_metrics package. **Upon the release of dbt-core v1.6 in late July, we will be deprecating support for the dbt_metrics package.** + + + +With the upcoming integration with MetricFlow, we're on the cusp of a new era for the dbt Semantic Layer. And if we’re honest with ourselves, it is a brighter future than the dbt_metrics package would have been able to support. + +We know that some of you have been using the package to help serve your organizations. 
**We will be providing migration scripts to help reduce the complexity of upgrading to the new specs.** + +If you’re interested in getting ahead of the transition, we recommend that you start familiarizing yourself with the new spec in this [Github Discussion](https://github.com/dbt-labs/dbt-core/discussions/7456). If you’re even more curious, you can read the MetricFlow documentation, although some elements will change in the coming months as we work on the integration. I’m confident that, with a little time, you'll come to love MetricFlow far more than you did dbt_metrics. + +Before we get into the technical details around why we’re deprecating the package, I want to give a big thank you to everyone from the community who used or contributed to the dbt_metrics package over the last year. From the bottom of my heart, I’ve loved hearing from all of you and discussing ways to help you solve your organization's data problems. It’s been a dream come true to work in this area and it wouldn’t have been possible without all of you lovely folks. + +## Why we’re deprecating + +With all that said, let’s dig into the exact reasons why we’re deprecating the package. + +**Duplicative functionality:** + +MetricFlow and dbt_metrics share a common purpose – a simplified and standard way to generate SQL to query metrics. However, MetricFlow does this much more effectively, with advanced features and a more robust foundation. + +**The Jinja trap:** + +Relying on Jinja macros for generating SQL in dbt_metrics proved to be sub-optimal and restrictive. This approach limited the package's potential and made it difficult to scale and customize for each new metric type and piece of functionality that we wanted to add. In contrast, MetricFlow's Python implementation offers a far more flexible and expressive solution. 
+ +- *To further emphasize that MetricFlow is years ahead of dbt_metrics, it actually used to use Jinja templates but moved away from them in a complete overhaul in order to increase flexibility.* + +**Focusing on a dynamic semantic layer:** + +We feel strongly that the power of a Semantic Layer lies in its ability to serve the organization dynamically - to answer the user’s first, second, and third questions. As such, **MetricFlow will not support materializing metric queries as static database objects.** Instead, we will focus on caching functionality that increases performance without reducing capability. + +**Advanced SQL generation with intelligent joins:** + +MetricFlow brings advanced SQL generation to the table and leaves dbt_metrics far behind in terms of capabilities. One of the key features is its sophisticated handling of joins, which has been the number one feature requested by folks within the community. With MetricFlow you can effortlessly access all valid dimensions for your metrics on the fly, even when they are defined in different semantic models. Moreover, you can construct metrics using measures from multiple semantic models. + +- *How does it do this?* It creates a graph with semantic models as nodes and join paths as edges, automatically generating the appropriate join type based on the entity types. This approach helps avoid fan-out or chasm joins to ensure that the results are correct. **Trying to do this within Jinja would have been a fool's errand.** + +**First-rate validation with a three-step approach**: + +MetricFlow offers comprehensive validation for the semantics, making sure that your data models are sound and reliable. This process involves three key steps: + +1. **Data warehouse validation**: To further validate your semantics, MetricFlow checks if the semantic definitions exist in your data warehouse. 
By running queries against your data warehouse, it ensures that the generated SQL for semantic models, dimensions, and metrics will execute as intended. + +2. **Semantic validation**: After building your semantic models, MetricFlow runs a suite of tests to ensure that the semantics make sense. For example, it checks if measure names are unique or if metrics reference existing measures. This helps maintain the integrity and consistency of the semantic manifest. + +3. **Continuous Integration (CI)**: By integrating these validation steps into your CI pipeline, MetricFlow helps catch any issues early in the development process. This results in fewer surprises, a more reliable dbt pipeline, and increased confidence in your DAG. + +**A flexible foundation for integration**: + +MetricFlow is a more flexible foundation through which we can provide our integration partners with the tools to build differentiated experiences. This opens the door to exciting collaborations and makes it easier for our partners to build. + +So as we bid farewell to the `dbt_metrics` package, we say hello to MetricFlow and all the opportunities and advancements that come with it. This is the next era for the Semantic Layer. + +With the migration tools and resources coming soon, we're committed to supporting you through this transition. We extend our gratitude to everyone who has used, contributed to, or supported dbt_metrics, and we invite you all to join us on this exciting journey toward a brighter future in data analytics. If you're interested in discussing, please come on over to [#dbt-core-metrics](https://getdbt.slack.com/archives/C02CCBBBR1D)! 
\ No newline at end of file diff --git a/website/blog/2023-05-01-evolving-data-engineer-craft.md b/website/blog/2023-05-01-evolving-data-engineer-craft.md new file mode 100644 index 00000000000..a3113240227 --- /dev/null +++ b/website/blog/2023-05-01-evolving-data-engineer-craft.md @@ -0,0 +1,193 @@ +--- +title: "Data engineers + dbt v1.5: Evolving the craft for scale" +description: "Where have we been? Where are we going? What does the future hold for data engineers? How can dbt help you get there?" +slug: evolving-data-engineer-craft + +authors: [sung_chung, kira_furuichi] + +hide_table_of_contents: false + +date: 2023-05-01 +is_featured: true +--- + +# Data Engineers + dbt 1.5: Evolving the Craft for Scale + +I, Sung, entered the data industry by chance in Fall 2014. I was using this thing called audit command language (ACL) to automate debits equal credits for accounting analytics (yes, it’s as tedious as it sounds). I remember working my butt off in a hotel room in Des Moines, Iowa where the most interesting thing there was a Panda Express. It was late in the AM. I’m thinking about 2 am. And I took a step back and thought to myself, “Why am I working so hard for something that I just don’t care about with tools that hurt more than help?” + + +I did lots of soul searching and deduced I loved analytics, but not the job and subject matter. My next gig was in consulting where I bootstrapped my way into data engineering and had to learn the whole gamut below. 
+ +| Tech skills | Place in tech stack | Why it mattered at the time | +| --- | --- | --- | +| Airflow | Orchestrator | The industry standard to run data pipelines | +| SQL | Lingua franca of data transformation | My business logic codified (think: revenue by month) | +| Python | Lingua franca of data engineering | It’s how you use Airflow | +| Terraform | Get infra ready for airflow Kubernetes cluster | Infrastructure automation | +| Google Cloud | Cloud | Big customer footprint | +| Amazon Web Services | Cloud | Big customer footprint | +| dbt | The T in ELT | The reason people finally test their data with SQL | +| BigQuery | Cloud data warehouse | A lot of my clients used this | + +These are still great skills to learn and maintain even six years after I learned them in 2017. Armed with them, I finally saw the magic of the modern data stack and what problems it could solve. It took my questionable little pipelines back in 2014 and made them gleam with a new shine (and reliability). I felt like what the cool kids call a **[10x data engineer](https://knowyourmeme.com/memes/10x-engineer)**. + +However, as my skills grew, so did the problems. Big Data eventually turned into data swamps, the charm of using these great tools lost its shine, and my excitement gradually gave way to exhaustion. Not because they’re bad tools, but because the problem space of managing giant mounds of data required something data tools are still wrestling with today: scale and control. I kept looking for goldilocks projects to save/make money for companies. I wanted to build prestige in my career. But in practice, I was babysitting brittle data pipelines. To enable dozens of people was mind-numbing, much less hundreds of data analysts to all work elegantly together. + +I’m still juggling my messy data pipelines and wondering what’s signal vs. noise in how to evolve my skills. I feel like a .5x data engineer. It’s like taking 2 steps forward and 1 big step back. 
So my question becomes: + +> Why am I working so hard for data pipelines no one uses and **scale** that hurts more than helps? +> + +I take a step back and realize my job is playing more defense than offense. My KPIs are less about revenue and cost impact, and more about how many times I get yelled at this week and making that number go down and to the right. This was/is my story, and I have a strong feeling it’s yours too. + +And I know there isn’t a silver bullet to solve all the above, but I do want to see momentum in the right direction. Where have tools like dbt met me and how exactly does it need to meet me going forward? + +## Where dbt is meeting data engineers, and where it’s going + +The joys and pains of data engineering and analytics engineering are real; the win you get when a stakeholder eventually contributes to a dbt model; the loss when a pipeline breaks, and the onslaught of Slack notifications that come hurling your way. dbt transformed ;) the way data teams interact with their data, and the people that depend on them. When dbt was first developed, it aimed to bring the best practices in software engineering to the field of analytics—this meant version-controlled, rigorously tested, and collaborative data transformations. dbt brought code-based tests, integrated CI, efficient development with packages, and global docs. These features have been foundational to the way data teams work, and have allowed data engineers to focus on the most important part of their job: building data pipelines that power the business. + +### Building the future: Where data engineers are going with dbt + +As dbt has grown, so has the complexity of dbt projects. Tristan has [written extensively about this](https://www.getdbt.com/blog/analytics-engineering-next-step-forwards/), but a few years ago, a *big* dbt project was ~500 models. Today, there are many organizations with *thousands* of dbt models. 
This level of complexity and organization has changed the landscape of interesting problems in the analytics engineering space; dependency graphs become increasingly large, identifying ownership becomes murky, and the barrier to contribution is raised. You can see how larger, complex data teams approach this today in this public GitHub discussion: [https://github.com/dbt-labs/dbt-core/discussions/5244](https://github.com/dbt-labs/dbt-core/discussions/5244). + +v1.5 features aim to support the growth of those type of dbt projects by going back to the roots of software engineering best practices—dbt v1.5 is bringing service oriented architectures to a dbt project near you. dbt’s v1.5 features of contracts, model versions, and group permissions—alongside all of the foundational “dbtonic” things—culminate in a toolkit that will allow data engineers, alongside analysts, to build long-term, scalable, and efficient dbt projects. + +Below, we’ll breakdown where dbt v1.5 is evolving scale and control in your work, and how it’ll elevate your daily practice of data engineering (and remove some of those panicked Slack messages 😉). + +[**Model Contracts**](https://docs.getdbt.com/docs/collaborate/publish/model-contracts) + +- **Problems you’re living through**: I can’t guarantee the shape of my data (think: column names, data types, no blank values) without triple checking my work and running `dbt build` a couple times and eyeballing my data. I get tired of doing this everyday, so I end up not doing it in the long-term. +- **Solution**: Model contracts allow you to define how a model should conform—which columns will never be `null`, which columns will always be a certain type, and more—all within a `YAML` file. These contracts are meant to be binding artifacts that create levels of accountability between the folks that create a model with a contract, and the downstream consumers of that model. 
+- **How it will change your daily work**: The lingering doubt—*”can I trust this table?”*—is removed with a model contract. These contracts create systems of accountability, governance, and reliability, ultimately allowing people to feel confident in the models they reference. With a contract, you shouldn’t have to test if the primary key from an upstream reference is null, the contract stated as so—and that contract is law. + +```yaml +# snowflake contract example +models: + - name: dim_customers + config: + contract: + enforced: true + columns: + - name: id + data_type: integer + description: hello + constraints: + - type: not_null + - type: primary_key # not enforced -- will warn & include in DDL + - type: check # not supported -- will warn & exclude from DDL + expression: "id > 0" + tests: + - unique # primary_key constraint is not enforced + - name: customer_name + data_type: text + - name: first_transaction_date + data_type: date +``` + +```sql +--SQL run against database +create or replace transient table ..dim_customers +( + id integer not null primary key, + customer_name text, + first_transaction_date date +) +as +( +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +); +``` + +[**Model Versions**](https://docs.getdbt.com/docs/collaborate/publish/model-versions) + +- **Problems you’re living through**: I change my vital model `fct_orders.sql` every week, and many people rely on this for their work. However, I keep getting doubtful questions on what’s changed since my last update, and I don’t have great way to instill confidence that this will/won’t break the way they rely on it. +- **What it is**: Model versions in v1.5 allow you to create, specify, and reference versions of models. Core reporting models can now be updated and deprecated following software engineering practices and create systems of accountability between data creators and data consumers. 
+- **How it will change your daily work**: Not every model is going to need to be versioned, but for core models that power your business intelligence, *power your data team*, you will now have the option to create multiple versions of a model and implement breaking changes in a more realistic and accountable way. Say I’m the primary owner of Core Data Team dbt `Project A`, and inside that project contains a core `dim_customers` model that powers the way finance, customer success, and marketing analyze customer data and CLV (customer lifetime value). I need to make a breaking change to `dim_customers`—CLV is going to be removed in favor of a more complex ROI value. Finance team uses the existing CLV value for cohort analysis and other reports, but understands that the new ROI column may be more favorable over time. However, it takes time to transition those reports and systems to conform to the ROI values, so `Project A` can develop a `dim_customers_v2` that drops LTV in favor for the new ROI. 
+ +```yaml +models: + - name: dim_customers + latest_version: 2 + config: + contract: + enforced: true + columns: + - name: id + data_type: integer + description: hello + constraints: + - type: not_null + - type: primary_key # not enforced -- will warn & include in DDL + - type: check # not supported -- will warn & exclude from DDL + expression: "id > 0" + tests: + - unique # primary_key constraint is not enforced, so also verify with a dbt test + - name: customer_name + data_type: text + - name: first_transaction_date + data_type: date + versions: + - v: 2 + columns: + - include: '*' + exclude: ['first_transaction_date'] + - v: 1 + columns: + - include: '*' + defined_in: dim_customers +``` + +```sql +select * from {{ ref('dim_customers', v=2) }} +``` + +[**Model Access**](https://docs.getdbt.com/docs/collaborate/govern/model-access) + +- **Problems you’re living through**: I split out my dbt project subdirectories in sales, marketing, and finance, and have a large team referencing dbt models across those folders everyday. However, I notice a lot of the references use staging tables that are incomplete and shouldn’t be referenced. I don’t have a good way to prevent inappropriate references. +- **What it is**: You can now define public, private, and protected models within dbt project subdirectories and models so your teammates only touch what they’re supposed to! +- **How it will change your daily work**: The exhaustive sighs of telling your teammates, “you’re not supposed to use that model” is now gone. dbt practices energetic boundaries between multiple files and subfolders and tells your teammates why they can’t reference a specific dbt model. + +```yaml +models: + - name: finance_model + access: private + group: finance + - name: marketing_model + group: marketing +``` + +```sql +--models/marketing/marketing_model.sql + +select * from {{ ref('finance_model') }} +``` + +```bash +$ dbt run -s marketing_model +... 
+dbt.exceptions.DbtReferenceError: Parsing Error + Node model.jaffle_shop.marketing_model attempted to reference node model.jaffle_shop.finance_model, + which is not allowed because the referenced node is private to the finance group. +``` + +## What does winning with v1.5 look like for you, the data engineer? +This is great and all, but how do we know if these features are working to make your work more streamlined, intuitive, or easier? Because you’re probably wondering, “Are we trying to inflate v1.5 as this silver bullet to solve all data transformation problems?” Short answer: “No”. We just want to have less headaches when it comes to governing and scaling your data work, and bring back the joy working with data. + +If it’s anything like the imagined future we have for you below, then we all win: + +| 😊 Emotional Victory | 📝 Future Resume Bullet Points | +| --- | --- | +| Invite people into the magic of working with and in data | - Scaled from 10 to 500 users in dbt Cloud and, on average, onboarded new users in 1 week | +| Getting one step closer to self-service without the eye-rolling | - Established uptime of 99.99% with core metrics like revenue, marketing, churn with model contracts and dbt Semantic Layer and reduced data validation efforts by the business by 5 hours per week | +| The tedious admin work melts away and you get that breath of relief knowing people aren’t “moving fast and breaking things”…as much | - Reduced 5% of all transform spend with less code implementing data model contracts with 10% more quality. Removed 4 hours per person per week in teams across finance, marketing, sales by reducing duplicative development by 20% and reduced basic context gathering | +| Get the taste of being offensive vs. defensive with your work | - Used dbt to drive revenue (think: embedded data products) and added a new SKU earning $500,000 per year | + +## So, what’s next? + +- Try out v1.5! 
Let us know how the ergonomics and functionality of model contracts, versions, and group permissions feel to you. Open up an issue if you notice any bugs. +- Watch the [Community recording on contracts](https://www.loom.com/share/375dee38aa9448deaed860a06487f8ff)—a great way to see them live in action—or [watch the recording from Staging](https://www.getdbt.com/resources/staging-april-2023/) to see dbt v1.5 features come to life! +- Comment directly in this post on thoughts of v1.5 or this article! +- Join the [#multi-project channel](https://getdbt.slack.com/archives/C04FP5LQA15) in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/)—start sparking up conversations with people like myself around the pains and gains of multi-deployment dbt projects. Validate if the constructs in v1.5 translate well to a multi-project future. \ No newline at end of file diff --git a/website/blog/2023-05-02-modeling-ragged-time-varying-hierarchies.md b/website/blog/2023-05-02-modeling-ragged-time-varying-hierarchies.md new file mode 100644 index 00000000000..d436df2628a --- /dev/null +++ b/website/blog/2023-05-02-modeling-ragged-time-varying-hierarchies.md @@ -0,0 +1,448 @@ +--- +title: "Modeling ragged time-varying hierarchies" +description: "Learn how to maximize the utility of complex hierarchical data in your analytics warehouse." +slug: modeling-ragged-time-varying-hierarchies + +authors: [sterling_paramore] + +tags: [analytics craft] +hide_table_of_contents: false + +date: 2023-05-02 +is_featured: true +--- + +This article covers an approach to handling time-varying ragged hierarchies in a dimensional model. These kinds of data structures are commonly found in manufacturing, where components of a product have both parents and children of arbitrary depth and those components may be replaced over the product's lifetime. The strategy described here simplifies many common types of analytical and reporting queries. 
+ +To help visualize this data, we're going to pretend we are a company that manufactures and rents out eBikes in a ride share application. When we build a bike, we keep track of the serial numbers of the components that make up the bike. Any time something breaks and needs to be replaced, we track the old parts that were removed and the new parts that were installed. We also precisely track the mileage accumulated on each of our bikes. Our primary analytical goal is to be able to report on the expected lifetime of each component, so we can prioritize improving that component and reduce costly maintenance. + +## Data model + +Obviously, a real bike could have a hundred or more separate components. To keep things simple for this article, let's just consider the bike, the frame, a wheel, the wheel rim, tire, and tube. Our component hierarchy looks like: + + + +This hierarchy is *ragged* because different paths through the hierarchy terminate at different depths. It is *time-varying* because specific components can be added and removed. + +Now let's take a look at how this data is represented in our source data systems and how it can be transformed to make analytics queries easier. + +### Transactional model + +Our ERP system (Enterprise Resource Planning) contains records that log when a specific component serial number (`component_id`) was installed in or removed from a parent assembly component (`assembly_id`). The top-most assembly component is the eBike itself, which has no parent assembly. So when an eBike (specifically, the eBike with serial number "Bike-1") is originally constructed, the ERP system would contain records that look like the following. 
+ +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| | Bike-1 | 2023-01-01 | | +| Bike-1 | Frame-1 | 2023-01-01 | | +| Bike-1 | Wheel-1 | 2023-01-01 | | +| Wheel-1 | Rim-1 | 2023-01-01 | | +| Wheel-1 | Tire-1 | 2023-01-01 | | +| Tire-1 | Tube-1 | 2023-01-01 | | + +Now let's suppose this bike has been ridden for a while, and on June 1, the user of the bike reported a flat tire. A service technician then went to the site, replaced the tube that was in the wheel, and installed a new one. They logged this in the ERP system, causing one record to be updated with a `removed_at` date, and another record to be created with the new tube `component_id`. + + +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| ... | ... | ... | ... | +| Tire-1 | Tube-1 | 2023-01-01 | 2023-06-01 | +| Tire-1 | Tube-2 | 2023-06-01 | | +| ... | ... | ... | ... | + +After a few more months, there is a small crash. Don't worry, everyone's OK! However, the wheel (`Wheel-1`) is totally broken and must be replaced (with `Wheel-2`). When the technician updates the ERP, the entire hierarchy under the replaced wheel is also updated, as shown below. + +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| Bike-1 | Wheel-1 | 2023-01-01 | 2023-08-01 | +| Wheel-1 | Rim-1 | 2023-01-01 | 2023-08-01 | +| Wheel-1 | Tire-1 | 2023-01-01 | 2023-08-01 | +| Tire-1 | Tube-2 | 2023-06-01 | 2023-08-01 | # Note that this part has a different install date +| Bike-1 | Wheel-2 | 2023-08-01 | | +| Wheel-2 | Rim-2 | 2023-08-01 | | +| Wheel-2 | Tire-2 | 2023-08-01 | | +| Tire-2 | Tube-3 | 2023-08-01 | | + + +After all of the above updates and additions, our ERP data looks like the following. 
+ +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| | Bike-1 | 2023-01-01 | | +| Bike-1 | Frame-1 | 2023-01-01 | | +| Bike-1 | Wheel-1 | 2023-01-01 | 2023-08-01 | +| Wheel-1 | Rim-1 | 2023-01-01 | 2023-08-01 | +| Wheel-1 | Tire-1 | 2023-01-01 | 2023-08-01 | +| Tire-1 | Tube-1 | 2023-01-01 | 2023-06-01 | +| Tire-1 | Tube-2 | 2023-06-01 | 2023-08-01 | +| Bike-1 | Wheel-2 | 2023-08-01 | | +| Wheel-2 | Rim-2 | 2023-08-01 | | +| Wheel-2 | Tire-2 | 2023-08-01 | | +| Tire-2 | Tube-3 | 2023-08-01 | | + +So that's all fine and good from the perspective of the ERP system. But this data structure can be difficult to work with if we want to generate reports that calculate the total mileage accumulated on various components, or the average mileage of a particular component type, or how one component type might affect the lifetime of another component. + +### Multivalued dimensional model + +In dimensional modeling, we have *fact* tables that contain measurements and *dimension* tables that contain the context for those measurements (attributes). In our eBike data warehouse, we have a fact table that contains one record for each eBike for each day it is ridden and the measured mileage accumulated during rides that day. This fact table contains *surrogate key* columns, indicated by the `_sk` suffix. These are usually system-generated keys used to join to other tables in the database; the specific values of these keys are not important. + +**`fct_daily_mileage`:** + +| `bike_sk` | `component_sk` | `ride_at` | `miles` | +| - | - | - | - | +| bsk1 | csk1 | 2023-01-01 | 3 | +| bsk1 | csk1 | 2023-01-02 | 2 | +| bsk1 | csk1 | 2023-01-03 | 0 | +| bsk1 | csk1 | 2023-01-04 | 0 | +| ... | ... | ... | ... | +| bsk1 | csk3 | 2023-08-01 | 7 | +| bsk1 | csk3 | 2023-08-02 | 8 | +| bsk1 | csk3 | 2023-08-03 | 4 | + +One of the dimension tables is a simple table containing information about the individual bikes we have manufactured. 
+ +**`dim_bikes`:** + +| `bike_sk` | `bike_id` | `color` | `model_name` | +| - | - | - | - | +| bsk1 | Bike-1 | Orange | Wyld Stallyn | + + +There is a simple many-to-one relationship between `fct_daily_mileage` and `dim_bikes`. If we need to calculate the total mileage accumulated for each bike in our entire fleet of eBikes, we just join the two tables and aggregate on the `miles` measurement. + +```sql +select + dim_bikes.bike_id, + sum(fct_daily_mileage.miles) as miles +from + fct_daily_mileage +inner join + dim_bikes + on + fct_daily_mileage.bike_sk = dim_bikes.bike_sk +group by + 1 +``` + +Extending this to determine if orange bikes get more use than red bikes or whether certain models are preferred requires similarly straightforward queries. + +Dealing with all of the components is more complicated because there are many components installed on the same day. The relationship between days when the bikes are ridden and the components is thus *multivalued*. In `dim_bikes`, there is one record per bike and surrogate key. Our components dimension, by contrast, will have multiple records with the same surrogate key and will therefore be a *multivalued dimension*. Of course, to make things even more complicated, the components can change from day to day. To construct the multivalued dimension table, we break down the time-varying component hierarchy into distinct ranges of time where all of the components in a particular bike remain constant. At specific points in time where the components are changed, a new surrogate key is created. The final dimension table for our example above looks like the following, where the `valid_from_at` and `valid_to_at` represent the beginning and end of a range of time where all the components of an eBike remain unchanged. 
+ + +**`mdim_components`:** + +| `component_sk` | `assembly_id` | `component_id` | `depth` | `installed_at` | `removed_at` | `valid_from_at` | `valid_to_at` | +| - | - | - | - | - | - | - | - | +| csk1 | | Bike-1 | 0 | 2023-01-01 | | 2023-01-01 | 2023-06-01 | +| csk1 | Bike-1 | Frame-1 | 1 | 2023-01-01 | | 2023-01-01 | 2023-06-01 | +| csk1 | Bike-1 | Wheel-1 | 1 | 2023-01-01 | 2023-08-01 | 2023-01-01 | 2023-06-01 | +| csk1 | Wheel-1 | Rim-1 | 2 | 2023-01-01 | 2023-08-01 | 2023-01-01 | 2023-06-01 | +| csk1 | Wheel-1 | Tire-1 | 2 | 2023-01-01 | 2023-08-01 | 2023-01-01 | 2023-06-01 | +| csk1 | Tire-1 | Tube-1 | 3 | 2023-01-01 | 2023-06-01 | 2023-01-01 | 2023-06-01 | +| csk2 | | Bike-1 | 0 | 2023-01-01 | | 2023-06-01 | 2023-08-01 | +| csk2 | Bike-1 | Frame-1 | 1 | 2023-01-01 | | 2023-06-01 | 2023-08-01 | +| csk2 | Bike-1 | Wheel-1 | 1 | 2023-01-01 | 2023-08-01 | 2023-06-01 | 2023-08-01 | +| csk2 | Wheel-1 | Rim-1 | 2 | 2023-01-01 | 2023-08-01 | 2023-06-01 | 2023-08-01 | +| csk2 | Wheel-1 | Tire-1 | 2 | 2023-01-01 | 2023-08-01 | 2023-06-01 | 2023-08-01 | +| csk2 | Tire-1 | Tube-2 | 3 | 2023-06-01 | 2023-08-01 | 2023-06-01 | 2023-08-01 | +| csk3 | | Bike-1 | 0 | 2023-01-01 | | 2023-08-01 | | +| csk3 | Bike-1 | Frame-1 | 1 | 2023-01-01 | | 2023-08-01 | | +| csk3 | Bike-1 | Wheel-2 | 1 | 2023-08-01 | | 2023-08-01 | | +| csk3 | Wheel-2 | Rim-2 | 2 | 2023-08-01 | | 2023-08-01 | | +| csk3 | Wheel-2 | Tire-2 | 2 | 2023-08-01 | | 2023-08-01 | | +| csk3 | Tire-2 | Tube-3 | 3 | 2023-08-01 | | 2023-08-01 | | + +Now, let's look at how this structure can help in writing queries. In a later section of this article, we'll examine the SQL code that can take our ERP table and convert it into this dimensional model. + +### Mileage for a component + +Suppose we wanted to know the total mileage accumulated on "Wheel-1". The SQL code for determining this is very similar to that for determining the mileage for a given bike. 
+ +```sql +select + mdim_components.component_id, + sum(fct_daily_mileage.miles) as miles +from + fct_daily_mileage +inner join + mdim_components + on + fct_daily_mileage.component_sk = mdim_components.component_sk +where + component_id = 'Wheel-1' +group by + 1 +``` + +:::caution + +One thing to be *very cautious* about when working with multivalued dimensions is that you need to be careful interpreting aggregations. For example, suppose we chose to aggregate on `top_assembly_id` (to reduce clutter, this field is not shown in the data model above because it is just "Bike-1" for each record). For this aggregation, we would be over-counting the total mileage on that top assembly because the join would result in a Cartesian product and thus we'd get a ["fan-out" situation](https://community.looker.com/technical-tips-tricks-1021/the-problem-of-sql-fanouts-30232). +::: + +### Bonus: Finding components installed at the same time as other components + +This structure simplifies other kinds of interesting analysis. Suppose we wanted to start exploring how one component affects another, like whether certain brands of tube needed to be replaced more often if they were in a new brand of tire. We can do this by partitioning the data into the segments of time where the components are not changing and looking for other components installed at the same time. For example, to find all of the components that were ever installed at the same time "Tube-3" was installed, we can collect them with a simple window function. We could then use the results of this query in a regression or other type of statistical analysis. + +```sql +select distinct + component_id +from + mdim_components +qualify + sum(iff(component_id = 'Tube-3', 1, 0)) over (partition by valid_from_at, valid_to_at) > 0 +``` + + +## SQL code to build the dimensional model + +Now we get to the fun part! This section shows how to take the ERP source data and turn it into the multivalued dimensional model. 
This SQL code was written and tested using Snowflake, but should be adaptable to other dialects. + +### Traversing the hierarchy + +The first step will be to traverse the hierarchy of components to find all components that belong to the same top assembly. In our example above, we only had one bike and thus just one top assembly; in a real system, there will be many (and we may even swap components between different top assemblies!). + +The key here is to use a [recursive join](https://docs.snowflake.com/en/sql-reference/constructs/with#recursive-clause) to move from the top of the hierarchy to all children and grandchildren. The top of the hierarchy is easy to identify because they are the only records without any parents. + +```sql +with recursive +-- Contains our source data with records that link a child to a parent +components as ( + select + *, + -- Valid dates start as installed/removed, but may be modified as we traverse the hierarchy below + installed_at as valid_from_at, + removed_at as valid_to_at + from + erp_components +), + +-- Get all the source records that are at the top of hierarchy +top_assemblies as ( + select * from components where assembly_id is null +), + +-- This is where the recursion happens that traverses the hierarchy +traversal as ( + -- Start at the top of hierarchy + select + -- Keep track of the depth as we traverse down + 0 as component_hierarchy_depth, + -- Flag to determine if we've entered a circular relationship + false as is_circular, + -- Define an array that will keep track of all of the ancestors of a component + [component_id] as component_trace, + -- At the top of the hierarchy, the component is the top assembly + component_id as top_assembly_id, + + assembly_id, + component_id, + + installed_at, + removed_at, + valid_from_at, + valid_to_at + from + top_assemblies + + union all + + -- Join the current layer of the hierarchy with the next layer down by linking + -- the current component id to the assembly id of the child + 
select + traversal.component_hierarchy_depth + 1 as component_hierarchy_depth, + -- Check for any circular dependencies + array_contains(components.component_id::variant, traversal.component_trace) as is_circular, + -- Append trace array + array_append(traversal.component_trace, components.component_id) as component_trace, + -- Keep track of the top of the assembly + traversal.top_assembly_id, + + components.assembly_id, + components.component_id, + + components.installed_at, + components.removed_at, + -- As we recurse down the hierarchy, only want to consider time ranges where both + -- parent and child are installed; so choose the latest "from" timestamp and the earliest "to". + greatest(traversal.valid_from_at, components.valid_from_at) as valid_from_at, + least(traversal.valid_to_at, components.valid_to_at) as valid_to_at + from + traversal + inner join + components + on + traversal.component_id = components.assembly_id + and + -- Exclude component assemblies that weren't installed at the same time + -- This may happen due to source data quality issues + ( + traversal.valid_from_at < components.valid_to_at + and + traversal.valid_to_at >= components.valid_from_at + ) + where + -- Stop if a circular hierarchy is detected + not array_contains(components.component_id::variant, traversal.component_trace) + -- There can be some bad data that might end up in hierarchies that are artificially extremely deep + and traversal.component_hierarchy_depth < 20 +), + +final as ( + -- Note that there may be duplicates at this point (thus "distinct"). + -- Duplicates can happen when a component's parent is moved from one grandparent to another. + -- At this point, we only traced the ancestry of a component, and fixed the valid/from dates + -- so that all child ranges are contained in parent ranges. 
+ + select distinct * + from + traversal + where + -- Prevent zero-time (or less) associations from showing up + valid_from_at < valid_to_at +) + +select * from final +``` + +At the end of the above step, we have a table that looks very much like the `erp_components` that it used as the source, but with a few additional valuable columns: + +* `top_assembly_id` - This is the most important output of the hierarchy traversal. It ties all sub components to their common parent. We'll use this in the next step to chop up the hierarchy into all the distinct ranges of time where the components that share a common top assembly are constant (and each distinct range of time and `top_assembly_id` getting their own surrogate key). +* `component_hierarchy_depth` - Indicates how far removed a component is from the top assembly. +* `component_trace` - Contains an array of all the components linking this component to the top assembly. +* `valid_from_at`/`valid_to_at` - If you have really high-quality source data, these will be identical to `installed_at`/`removed_at`. However, in the real world, we've found cases where the install and removal dates are not consistent between parent and child, either due to a data entry error or a technician forgetting to note when a component was removed. So for example, we may have a parent assembly that was removed along with all of its children, but only the parent assembly has `removed_at` populated. At this point, the `valid_from_at` and `valid_to_at` tidy up these kinds of scenarios. + +### Temporal range join + +The last step is to perform a [temporal range join](https://discourse.getdbt.com/t/joining-snapshot-tables-time-range-based-joins/3226) between the top assembly and all of its descendants. This is what splits out all of the time-varying component changes into distinct ranges of time where the component hierarchy is constant. 
This range join makes use of [the dbt macro in this gist](https://gist.github.com/gnilrets/48886b4c8945dde1da13547c2373df73), the operation of which is out-of-scope for this article, but you are encouraged to investigate it and the discourse post mentioned earlier. + +```sql +-- Start with all of the assemblies at the top (hierarchy depth = 0) +with l0_assemblies as ( + select + top_assembly_id, + component_id, + -- Prep fields required for temporal range join + {{ dbt_utils.surrogate_key(['component_id', 'valid_from_at']) }} as dbt_scd_id, + valid_from_at as dbt_valid_from, + valid_to_at as dbt_valid_to + from + component_traversal + where + component_hierarchy_depth = 0 +), + +components as ( + select + top_assembly_id, + component_hierarchy_depth, + component_trace, + assembly_id, + component_id, + installed_at, + removed_at, + -- Prep fields required for temporal range join + {{ dbt_utils.surrogate_key(['component_trace', 'valid_from_at'])}} as dbt_scd_id, + valid_from_at as dbt_valid_from, + valid_to_at as dbt_valid_to + from + component_traversal +), + +-- Perform temporal range join +{{ + trange_join( + left_model='l0_assemblies', + left_fields=[ + 'top_assembly_id', + ], + left_primary_key='top_assembly_id', + right_models={ + 'components': { + 'fields': [ + 'component_hierarchy_depth', + 'component_trace', + 'assembly_id', + 'component_id', + 'installed_at', + 'removed_at', + ], + 'left_on': 'component_id', + 'right_on': 'top_assembly_id', + } + } + ) +}} + +select + surrogate_key, + top_assembly_id, + component_hierarchy_depth, + component_trace, + assembly_id, + component_id, + installed_at, + removed_at, + valid_from_at, + valid_to_at +from + trange_final +order by + top_assembly_id, + valid_from_at, + component_hierarchy_depth +``` + +## Bonus: component swap + +Before we go, let's investigate one other interesting scenario. Suppose we have two bikes, "Bike-1" and "Bike-2". 
While performing service, a technician notices that the color on the rim of "Bike-2" matches with the frame of "Bike-1" and vice-versa. Perhaps there was a mistake made during the initial assembly process? The technician decides to swap the wheels between the two bikes. The ERP system then shows that "Wheel-1" was removed from "Bike-1" on the service date and that "Wheel-1" was installed in "Bike-2" on the same date (similarly for "Wheel-2"). To reduce clutter below, we'll ignore Frames and Tubes. + +**`erp_components`:** + +| `assembly_id` | `component_id` | `installed_at` | `removed_at` | +| - | - | - | - | +| | Bike-1 | 2023-01-01 | | +| Bike-1 | Wheel-1 | 2023-01-01 | 2023-06-01 | +| Wheel-1 | Rim-1 | 2023-01-01 | | +| Wheel-1 | Tire-1 | 2023-01-01 | | +| | Bike-2 | 2023-02-01 | | +| Bike-2 | Wheel-2 | 2023-02-01 | 2023-06-01 | +| Wheel-2 | Rim-2 | 2023-02-01 | | +| Wheel-2 | Tire-2 | 2023-02-01 | | +| Bike-2 | Wheel-1 | 2023-06-01 | | +| Bike-1 | Wheel-2 | 2023-06-01 | | + +When this ERP data gets converted into the multivalued dimension, we get the table below. In the ERP data, only one kind of component assembly, the wheel, was removed/installed, but in the dimensional model all of the child components come along for the ride. In the table below, we see that "Bike-1" and "Bike-2" each have two distinct ranges of valid time, one prior to the wheel swap, and one after. 
+ +**`mdim_components`:** + +| `component_sk` | `top_assembly_id` | `assembly_id` | `component_id` | `valid_from_at` | `valid_to_at` | +| - | - | - | - | - | - | +| sk1 | Bike-1 | | Bike-1 | 2023-01-01 | 2023-06-01 | +| sk1 | Bike-1 | Bike-1 | Wheel-1 | 2023-01-01 | 2023-06-01 | +| sk1 | Bike-1 | Wheel-1 | Rim-1 | 2023-01-01 | 2023-06-01 | +| sk1 | Bike-1 | Wheel-1 | Tire-1 | 2023-01-01 | 2023-06-01 | +| sk2 | Bike-1 | | Bike-1 | 2023-06-01 | | +| sk2 | Bike-1 | Bike-1 | Wheel-2 | 2023-06-01 | | +| sk2 | Bike-1 | Wheel-2 | Rim-2 | 2023-06-01 | | +| sk2 | Bike-1 | Wheel-2 | Tire-2 | 2023-06-01 | | +| sk3 | Bike-2 | | Bike-2 | 2023-02-01 | 2023-06-01 | +| sk3 | Bike-2 | Bike-2 | Wheel-2 | 2023-02-01 | 2023-06-01 | +| sk3 | Bike-2 | Wheel-2 | Rim-2 | 2023-02-01 | 2023-06-01 | +| sk3 | Bike-2 | Wheel-2 | Tire-2 | 2023-02-01 | 2023-06-01 | +| sk4 | Bike-2 | | Bike-2 | 2023-06-01 | | +| sk4 | Bike-2 | Bike-2 | Wheel-1 | 2023-06-01 | | +| sk4 | Bike-2 | Wheel-1 | Rim-1 | 2023-06-01 | | +| sk4 | Bike-2 | Wheel-1 | Tire-1 | 2023-06-01 | | + +## Summary + +In this article, we've explored a strategy for creating a dimensional model for ragged time-varying hierarchies. We used a simple toy system involving one or two eBikes. In the real world, there would be many more individual products, deeper hierarchies, more component attributes, and the install/removal dates would likely be captured with a timestamp component as well. The model described here works very well even in these messier real world cases. + +If you have any questions or comments, please reach out to me by commenting on this post or contacting me on dbt slack (@Sterling Paramore). 
diff --git a/website/blog/2023-05-04-generating-dynamic-docs.md b/website/blog/2023-05-04-generating-dynamic-docs.md new file mode 100644 index 00000000000..1e704178b0a --- /dev/null +++ b/website/blog/2023-05-04-generating-dynamic-docs.md @@ -0,0 +1,370 @@ +--- +title: "Accelerate your documentation workflow: Generate docs for whole folders at once" +description: "For columns that are reused across models, Mikael walks through a DRY-method to make documentation easier, using the dbt Codegen package and docs blocks." +slug: generating-dynamic-docs-dbt + +authors: [mikael_thorup] + +tags: [dbt tutorials] +hide_table_of_contents: false + +date: 2023-05-17 +is_featured: true +--- + +At [Lunar](https://www.lunar.app/), most of our dbt models are sourcing from event-driven architecture. As an example, we have the following models for our `activity_based_interest` folder in our ingestion layer: + +- `activity_based_interest_activated.sql` +- `activity_based_interest_deactivated.sql` +- `activity_based_interest_updated.sql` +- `downgrade_interest_level_for_user.sql` +- `set_inactive_interest_rate_after_july_1st_in_bec_for_user.sql` +- `set_inactive_interest_rate_from_july_1st_in_bec_for_user.sql` +- `set_interest_levels_from_june_1st_in_bec_for_user.sql` + +This results in a lot of the same columns (e.g. `account_id`) existing in different models, across different layers. This means I end up: + +1. Writing/copy-pasting the same documentation over and over again +1. Halfway through, realizing I could improve the wording to make it easier to understand, and go back and update the `.yml` files I already did +1. Realizing I made a syntax error in my `.yml` file, so I go back and fix it +1. Realizing the columns are defined differently with different wording being used in other folders in our dbt project +1. Reconsidering my choice of career and pray that a large language model will steal my job +1. 
Considering if there’s a better way to be generating documentation used across different models + + + +In fact, I found a better way using some CLI commands, the dbt Codegen package and docs blocks. I also made the following meme in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/) channel #memes-and-off-topic-chatter to encapsulate this method: + + + +## What pain is being solved? + +If you need to document the same column multiple times, this method limits manual errors, makes it faster to write and maintain documentation, and improves consistency of documentation. **This documentation method saves me 50-80% of the time I previously spent on documentation, by making the documentation process in dbt more DRY and automated.** + +## What will you learn after reading this article? + +Not only will you learn how to work in an easier way with dbt documentation, but you will also become more familiar with [the dbt Codegen package](https://hub.getdbt.com/dbt-labs/codegen/latest/), docs blocks, regex, and terminal commands. + +:::note +Note that this solution has been tested on Mac/VS Code, and that regex behavior may vary between stacks. 
+::: + +## Prerequisites + +- Experience writing dbt documentation manually +- Installing dbt, the dbt Codegen package, and VS Code +- A folder in your dbt project which has a lot of undocumented dbt models, where a lot of column names overlap between the models + +## Case study + +In this article, we’ll use a current task of mine, where I mapped the following events related to interest rates: + +``` +models/core/activity_based_interest +├── events +│ ├── activity_based_interest_activated.sql +│ ├── activity_based_interest_deactivated.sql +│ ├── activity_based_interest_updated.sql +│ ├── downgrade_interest_level_for_user.sql +│ ├── set_inactive_interest_rate_after_july_1st_in_bec_for_user.sql +│ ├── set_inactive_interest_rate_from_july_1st_in_bec_for_user.sql +│ └── set_interest_levels_from_june_1st_in_bec_for_user.sql +└── models + └── f_activity_based_interest.sql +``` + +## Generate `.yml` with Codegen package + +The [dbt Codegen package](https://github.com/dbt-labs/dbt-codegen) generates dbt code and logs it to the command line, so you can copy and paste it to use in your dbt project. Rather than writing the content of `.yml` files manually, you can use the `generate_model_yaml` macro, which queries the database to gather table — and column names, and outputs this into a format ready to be copy-pasted into a `.yml` file. + +This macro allows you to run commands like: + +``` +dbt run-operation generate_model_yaml --args '{"model_names": ["your_model_name",], "upstream_descriptions": true}' +``` + +The arguments are, as per Codegen’s documentation: +- `model_names` (required): The model(s) you wish to generate YAML for. +- `upstream_descriptions` (optional, `default=False`): Whether you want to include descriptions for identical column names from upstream models. 
+ +This macro generates the YAML for a list of model(s), which you can then paste into a `schema.yml` file, for instance: + +``` +$ dbt run-operation generate_model_yaml --args '{"model_names": [ "activity_based_interest_activated"] }' +``` + +outputs: +``` +13:09:42 Running with dbt=1.3.1 +13:09:45 version: 2 + +models: + - name: activity_based_interest_activated + description: "" + columns: + - name: id + description: "" + + - name: user_id + description: "" + + - name: start_date + description: "" + + - name: end_date + description: "" + + - name: tier_threshold_amount + description: "" + + - name: tier_interest_percentage + description: "" + + - name: event_time + description: "" + + - name: event_day + description: "" +``` + +Everything from `version: 2` and onwards can be copy-pasted into your `.yml` file, and just like that, you’ve saved a lot of time having to write structure by hand (*and inevitably forgetting a ", a ', or making some random indentation error somewhere…*). + +### Generate `.yml` for several models at once + +For the astute observer, `model_names` accepts several models, which we can take advantage of. Thus, we don’t need to run this tool once per model. Instead, we can run: + +``` +$ dbt run-operation generate_model_yaml --args '{"model_names": [ "activity_based_interest_activated", "activity_based_interest_deactivated", "activity_based_interest_updated", "downgrade_interest_level_for_user", "f_activity_based_interest", "set_inactive_interest_rate_after_july_1st_in_bec_for_user", "set_inactive_interest_rate_from_july_1st_in_bec_for_user", "set_interest_levels_from_june_1st_in_bec_for_user"] }' +``` + +This returns a single `.yml` file, containing documentation for all of the models, similarly to above. 
Here’s a subset of the result set: + +``` +13:16:21 Running with dbt=1.3.1 +13:16:27 version: 2 + +models: + - name: activity_based_interest_activated + description: "" + columns: + - name: id + description: "" + + - name: user_id + description: "" + +... (truncated for example purposes) + + - name: set_inactive_interest_rate_after_july_1st_in_bec_for_user + description: "" + columns: + - name: id + description: "" + + - name: user_id + description: "" + + - name: start_date + description: "" + + - name: event_time + description: "" + + - name: event_day + description: "" + + - name: set_inactive_interest_rate_from_july_1st_in_bec_for_user + description: "" + columns: + - name: id + description: "" + + - name: user_id + description: "" + + - name: event_time + description: "" + + - name: event_day + description: "" +``` + +### Get model names programmatically + +In order to not have to manually write all of the model names, we can programmatically gather names of relevant models: + +``` +$ dbt ls -m models/core/activity_based_interest --output name | xargs -I{} echo -n ' "{}",' + "activity_based_interest_activated", "activity_based_interest_deactivated", "activity_based_interest_updated", "downgrade_interest_level_for_user", "f_activity_based_interest", "set_inactive_interest_rate_after_july_1st_in_bec_for_user", "set_inactive_interest_rate_from_july_1st_in_bec_for_user", "set_interest_levels_from_june_1st_in_bec_for_user",% + ``` + +1. `dbt ls -m models/core/activity_based_interest`: This command lists all dbt models in the models/core/activity_based_interest directory. +1. `--output name`: This option filters the output to only show the name of each model, rather than the context + model name. +1. `| xargs -I{} echo -n ' "{}",'`: This pipe sends the output of the previous command to `xargs`, which runs the echo command on each line of output. 
+ - `-I{}` specifies that `{}` should be replaced with the model name + - The `echo` command then formats the model name by wrapping it in double quotes and appending a comma and a space: `"model", "name",` + - The `-n` option for `echo` removes the trailing newline + +The output (⚠️ except the last two characters `,%` ) can then be copy-pasted into the following: + +``` +dbt run-operation generate_model_yaml --args '{"model_names": [ReplaceWithYourOutputFromPreviousCommand]}' +``` + +Which in turn can be copy-pasted into a new `.yml` file. In our example, we write it to `_activity_based_interest.yml`. + +## Create docs blocks for the new columns + +[Docs blocks](https://docs.getdbt.com/docs/collaborate/documentation#using-docs-blocks) can be utilized to write more DRY and robust documentation. To use docs blocks, update your folder structure to contain a `.md` file. Your file structure should now look like this: + +``` +models/core/activity_based_interest +├── _activity_based_interest_docs.md --New docs block markdown file +├── _activity_based_interest_docs.yml +├── events +│ ├── activity_based_interest_activated.sql +│ ├── activity_based_interest_deactivated.sql +│ ├── activity_based_interest_updated.sql +│ ├── downgrade_interest_level_for_user.sql +│ ├── set_inactive_interest_rate_after_july_1st_in_bec_for_user.sql +│ ├── set_inactive_interest_rate_from_july_1st_in_bec_for_user.sql +│ └── set_interest_levels_from_june_1st_in_bec_for_user.sql +└── models + └── f_activity_based_interest.sql +``` + +``` +$ cat models/core/activity_based_interest/_activity_based_interest_docs.md +{% docs activity_based_interest__id %} + +Primary key of the table. See sql for key definition. + +{% enddocs %} + +{% docs activity_based_interest__user_id %} + +The internal company id for a given user. 
+ +{% enddocs %} +``` + +``` +$ cat models/core/activity_based_interest/_activity_based_interest_docs.yml +version: 2 + +models: + - name: activity_based_interest_activated + description: "" + columns: + - name: id + description: "{{ doc('activity_based_interest__id') }}" + + - name: user_id + description: "{{ doc('activity_based_interest__user_id') }}" + +... (truncated for example purposes) + + - name: set_inactive_interest_rate_after_july_1st_in_bec_for_user + description: "" + columns: + - name: id + description: "{{ doc('activity_based_interest__id') }}" + + - name: user_id + description: "{{ doc('activity_based_interest__user_id') }}" +``` + +To confirm the formatting works, run the following command to get dbt Docs up and running: + +``` +$ dbt docs && dbt docs serve +``` + + +Here, you can confirm that the column descriptions using the doc blocks are working as intended. + + +### Get all unique columns within the folder + +To cut down on copy-pasting between your markdown and YAML files, find all of the unique columns in the folder and subfolders, by running the following command: + +``` +$ grep ' \- name:' models/core/activity_based_interest/_activity_based_interest_docs.yml | cut -c 15- | sort -u +end_date +event_day +event_time +id +is_active +last_updated_date +start_date +tier_interest_percentage +tier_threshold_amount +user_id +``` + +Breaking down this command: +- `grep ' \- name:' models/core/activity_based_interest/_activity_based_interest_docs.yml` searches for the pattern ` - name:` in the file `_activity_based_interest_docs.yml` located in the directory `models/core/activity_based_interest/`. +- `cut -c 15-` cuts the first 14 characters of each line from the output, i.e. in .yml files, we cut ` - name: ` from ` - name: some_column_name`, so you are left with only `some_column_name`. +- `sort -u` sorts the output in alphabetical order and removes any duplicate lines. 
+ +### Format to align with Jinja docs block + +Copy-paste the above output into your `.md` file, so it looks like the following: + +``` +$ cat models/core/activity_based_interest/_activity_based_interest_docs.md +end_date +event_day +event_time +id +is_active +last_updated_date +start_date +tier_interest_percentage +tier_threshold_amount +user_id +``` +Now, open your code editor, and replace `(.*)` with `{% docs column__activity_based_interest__$1 %}\n\n{% enddocs %}\n`, which will result in the following in your markdown file: + + + +Now you can add documentation to each of your columns. + +## Update `.yml` file to source documentation from the `.md` file + +You can programmatically identify all columns, and have them point towards the newly-created documentation. In your code editor, replace `\s{6}- name: (.*)\n description: ""` with ` - name: $1\n description: "{{ doc('column__activity_based_interest__$1') }}"`: + + + +⚠️ Some of your columns may already be available in existing docs blocks. In this example, the following replacements are done: +- `{{ doc('column__activity_based_interest__user_id') }}` → `{{ doc("column_user_id") }}` +- `{{ doc('column__activity_based_interest__event_day') }}` → `{{ doc("column_event_day") }}` + +## Check that everything works +Run `dbt docs generate`. If there are syntax errors, they will be surfaced at this stage. If successful, we can run `dbt docs serve` to perform a smoke test and ensure everything looks right: + + + +## Additional considerations + +- Q: What about when there are slight deviations in column documentation between models? + - A: I’ve been using dynamic documentation to contain the "essence" of the documentation, and then appending static documentation, like so: + + ```yaml + - name: user_id + description: "{{ doc('dynamic_docs') }}, additional static info" + ``` +- Q: Should I use this approach on modifications to an existing folder? 
+ - A: When adding additional models to a folder, or additional columns to an existing model, I would suggest adding new documentation and docs blocks manually rather than programmatically. + +- Q: Couldn’t this be made into a shell script? + - A: Yes! The solution above works well enough for me, but if you make a script, do let me know, as that would make this even easier to use. + + + + + + + + + diff --git a/website/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt.md b/website/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt.md new file mode 100644 index 00000000000..a8b0e1f9f8c --- /dev/null +++ b/website/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt.md @@ -0,0 +1,411 @@ +--- +title: "Building a historical user segmentation model with dbt" +description: "Learn how to use dbt to build custom user segments and track them over time." +slug: historical-user-segmentation + +authors: [santiago_jauregui] + +tags: [analytics craft, dbt tutorials, sql magic] +hide_table_of_contents: false + +date: 2023-06-13 +is_featured: true +--- + +## Introduction + +Most data modeling approaches for customer segmentation are based on a wide table with user attributes. This table only stores the current attributes for each user, and is then loaded into the various SaaS platforms via Reverse ETL tools. + +Take for example a Customer Experience (CX) team that uses Salesforce as a CRM. The users will create tickets to ask for assistance, and the CX team will start attending them in the order that they are created. This is a good first approach, but not a data driven one. + +An improvement to this would be to prioritize the tickets based on the customer segment, answering our most valuable customers first. An Analytics Engineer can build a segmentation to identify the power users (for example with an RFM approach) and store it in the data warehouse. 
The Data Engineering team can then export that user attribute to the CRM, allowing the customer experience team to build rules on top of it. + + + + +## Problems + +This is a pretty common approach that helps analytics engineering teams to add value to the company outside of just building models that impact reports or dashboards. The main issue here is that we often build models that only show us the latest status of each user, which brings the following challenges. + +### Validating the improvement + +Let’s say that you were able to build the segmentation and export it to the CRM. The customer experience team is now prioritizing the tickets based on the value added by your client. But how can you validate if this initiative actually worked? + +- If you are running a retention campaign and you are prioritizing your “Champions”, are you able to check if they are still “Champions” a month after you contacted them? With the model proposed before, you can’t verify if a Champion is still a champion because you only keep the customer’s last status. +- If you are running an activation campaign and you are prioritizing your “New Users”, you are also unable to check if they became “Champions” or if they are “Hibernating” a month later. + +### Code redundancy with data scientists + +It might also be the case that you have a data science or machine learning (ML) team in your company. The ML practitioners often use user attributes as an input to train their models (also called features in a data science context). In order for that attribute to be useful as a feature in the ML model, they need to know how it changed over time. + +As a result, data scientists often end up rewriting the same user attributes logic in their modeling language (typically Python). This results in wasted effort and inconsistency in business logic between the machine learning and the analytics engineering models. 
+ +Analytics engineering best practices are oriented to helping the data team reuse the models built by other practitioners. We need to find a way to extend that outside of just the analytics team and impact the data team as a whole. + +## Solution + +The approach to solving this is quite simple; we need to build a model that doesn’t just consider the last value for each user attribute, but instead saves a snapshot of how it changed over time. + +One way to solve it would be to use [dbt snapshots](https://docs.getdbt.com/docs/build/snapshots), but this would only keep the attributes history from the time of our model deployment onwards, losing potentially useful data that existed prior to that point in time. + +A better approach for our use case was to calculate the snapshots in our SQL logic. This snapshot can be calculated in various time windows (monthly, weekly, daily) depending on the type of analysis that you need to perform. + +In this section we’ll show you how to build a basic user segmentation model with RFM that only keeps the current value, and then we will go through the changes in the code to preserve the segmentation history. + +### RFM Segmentation + +The goal of RFM analysis is to segment customers into groups based on how recently they made a purchase (Recency), how frequently they make purchases (Frequency), and how much money they spend (Monetary). + +We are going to use just the Recency and Frequency matrix, and use the Monetary value as an accessory attribute. This is a common approach in companies where the Frequency and the Monetary Value are highly correlated. + + + +### RFM model for current segment + +We will first use a `SELECT *` CTE to load all our payments data. 
The columns that we will be using for the segmentation are the following: + +- **user_id:** Unique identifier for each user or customer +- **payment_date:** Date of each customer’s payment +- **payment_id:** Unique identifier of each payment +- **payment_amount:** Transacted amount of each payment + +```sql +WITH payments AS( + SELECT * + FROM ref {{'fact_payments'}} +), +``` + +| user_id | payment_date | payment_id | payment_amount | +| --- | --- | --- | --- | +| A | 2022-11-28 14:41:45 | AA | 2588.35 | +| B | 2022-11-28 14:42:37 | BB | 10104.99 | +| C | 2022-11-28 14:42:51 | CC | 2588.35 | +| D | 2022-11-28 14:43:42 | DD | 580.5 | +| E | 2022-11-28 14:44:44 | EE | 462.36 | + + +Next we will calculate the RFM (recency, frequency and monetary value) for each user: + +- **max_payment_date:** Last payment date of each user. We keep it for auditing +- **recency:** Days that passed between the last transaction of each user and today +- **frequency:** Quantity of user transactions in the analyzed window +- **monetary:** Transacted amount by the user in the analyzed window + +```sql +rfm_values AS ( + SELECT user_id, + MAX(payment_date) AS max_payment_date, + NOW() - MAX(payment_date) AS recency, + COUNT(DISTINCT payment_id) AS frequency, + SUM(payment_amount) AS monetary + FROM payments + GROUP BY user_id +), +``` + +| user_id | max_payment_date | recency | frequency | monetary | +| --- | --- | --- | --- | --- | +| A | 2023-04-20 10:22:39 | 4 18:20:22.034 | 4 | 83686.65 | +| B | 2023-04-20 10:56:15 | 4 17:46:46.034 | 13 | 53196.06 | +| C | 2023-04-24 13:19:18 | 0 15:23:43.034 | 22 | 56422.6 | +| D | 2023-04-19 19:00:24 | 5 09:42:37.034 | 4 | 2911.16 | +| E | 2023-03-23 19:22:00 | 32 09:21:01.034 | 40 | 30595.15 | + +There are various approaches to dividing users based on their RFM values. In this model we use percentiles to divide customers into groups based on their relative ranking in each of the three metrics, using the `PERCENT_RANK()` function. 
+ +```sql +rfm_percentiles AS ( + SELECT user_id, + recency, + frequency, + monetary, + PERCENT_RANK() OVER (ORDER BY recency DESC) AS recency_percentile, + PERCENT_RANK() OVER (ORDER BY frequency ASC) AS frequency_percentile, + PERCENT_RANK() OVER (ORDER BY monetary ASC) AS monetary_percentile + FROM rfm_values +), +``` + +| user_id | recency | frequency | monetary | recency_percentile | frequency_percentile | monetary_percentile | +| --- | --- | --- | --- | --- | --- | --- | +| A | 44 22:06:59.615 | 8 | 960.01 | 0.65 | 0.75 | 0.5 | +| B | 421 15:21:49.829 | 13 | 2348.49 | 0.09 | 0.84 | 0.78 | +| C | 1 15:04:48.922 | 7 | 3532.08 | 0.97 | 0.71 | 0.81 | +| D | 4 21:16:33.112 | 4 | 490.14 | 0.91 | 0.56 | 0.34 | +| E | 2 08:08:22.921 | 14 | 7239.69 | 0.95 | 0.85 | 0.28 | + +Now that we have the percentiles of each RFM value of each user, we can assign them a score based on where they end up in the distribution, going by steps of 0.2 or 20% each: + +- **recency_score:** Recency percentile values grouped from 1 to 5 +- **frequency_score:** Frequency percentile values grouped from 1 to 5 +- **monetary_score:** Monetary percentile values grouped from 1 to 5 + +```sql +rfm_scores AS( + SELECT *, + CASE + WHEN recency_percentile >= 0.8 THEN 5 + WHEN recency_percentile >= 0.6 THEN 4 + WHEN recency_percentile >= 0.4 THEN 3 + WHEN recency_percentile >= 0.2 THEN 2 + ELSE 1 + END AS recency_score, + CASE + WHEN frequency_percentile >= 0.8 THEN 5 + WHEN frequency_percentile >= 0.6 THEN 4 + WHEN frequency_percentile >= 0.4 THEN 3 + WHEN frequency_percentile >= 0.2 THEN 2 + ELSE 1 + END AS frequency_score, + CASE + WHEN monetary_percentile >= 0.8 THEN 5 + WHEN monetary_percentile >= 0.6 THEN 4 + WHEN monetary_percentile >= 0.4 THEN 3 + WHEN monetary_percentile >= 0.2 THEN 2 + ELSE 1 + END AS monetary_score + FROM rfm_percentiles +), +``` + +| user_id | recency_percentile | frequency_percentile | monetary_percentile | recency_score | frequency_score | monetary_score | +| --- | 
--- | --- | --- | --- | --- | --- | +| A | 0.26 | 0.3 | 0.12 | 2 | 2 | 1 | +| B | 0.94 | 0.38 | 0.23 | 5 | 2 | 2 | +| C | 0.85 | 0.96 | 0.87 | 5 | 5 | 5 | +| D | 0.71 | 0.63 | 0.93 | 4 | 4 | 5 | +| E | 0.67 | 0.51 | 0.76 | 4 | 3 | 5 | + +Lastly, we can segment the users by their frequency and recency scores based on the proposed R-F matrix: + +- **rfm_segment:** Segment of each user based on a mapping of the recency and frequency scores. + +```sql + +rfm_segment AS( +SELECT *, + CASE + WHEN recency_score <= 2 + AND frequency_score <= 2 THEN 'Hibernating' + WHEN recency_score <= 2 + AND frequency_score <= 4 THEN 'At Risk' + WHEN recency_score <= 2 + AND frequency_score <= 5 THEN 'Cannot Lose Them' + WHEN recency_score <= 3 + AND frequency_score <= 2 THEN 'About to Sleep' + WHEN recency_score <= 3 + AND frequency_score <= 3 THEN 'Need Attention' + WHEN recency_score <= 4 + AND frequency_score <= 1 THEN 'Promising' + WHEN recency_score <= 4 + AND frequency_score <= 3 THEN 'Potential Loyalists' + WHEN recency_score <= 4 + AND frequency_score <= 5 THEN 'Loyal Customers' + WHEN recency_score <= 5 + AND frequency_score <= 1 THEN 'New Customers' + WHEN recency_score <= 5 + AND frequency_score <= 3 THEN 'Potential Loyalists' + ELSE 'Champions' + END AS rfm_segment +FROM rfm_scores +) +SELECT * +FROM rfm_segment +``` + +| user_id | recency_score | frequency_score | monetary_score | rfm_segment | +| --- | --- | --- | --- | --- | +| A | 4 | 3 | 5 | Potential Loyalists | +| B | 4 | 5 | 5 | Loyal Customers | +| C | 5 | 4 | 5 | Champions | +| D | 1 | 5 | 5 | Cannot Lose Them | +| E | 1 | 4 | 5 | At Risk | + +### RFM model with segmentation history + +This next example shows how you can build a model with a snapshot of the user attributes at the end of each month. The same could be built for a weekly model with minor adjustments. 
+ +```sql +WITH payments AS( + SELECT * + FROM ref {{'fact_payments'}} +), +months AS( + SELECT NOW() AS date_month + UNION ALL + SELECT DISTINCT date_month AS date_month + FROM ref {{'dim_calendar'}} +), +payments_with_months AS( + SELECT user_id, + date_month, + payment_date, + payment_id, + payment_amount + FROM months + JOIN payments ON payment_date <= date_month +), +rfm_values AS ( + SELECT user_id, + date_month, + MAX(payment_date) AS max_payment_date, + date_month - MAX(payment_date) AS recency, + COUNT(DISTINCT payment_id) AS frequency, + SUM(payment_amount) AS monetary + FROM payments_with_months + GROUP BY user_id, date_month +), +rfm_percentiles AS ( + SELECT user_id, + date_month, + recency, + frequency, + monetary, + PERCENT_RANK() OVER (ORDER BY recency DESC) AS recency_percentile, + PERCENT_RANK() OVER (ORDER BY frequency ASC) AS frequency_percentile, + PERCENT_RANK() OVER (ORDER BY monetary ASC) AS monetary_percentile + FROM rfm_values +), +rfm_scores AS( + SELECT *, + CASE + WHEN recency_percentile >= 0.8 THEN 5 + WHEN recency_percentile >= 0.6 THEN 4 + WHEN recency_percentile >= 0.4 THEN 3 + WHEN recency_percentile >= 0.2 THEN 2 + ELSE 1 + END AS recency_score, + CASE + WHEN frequency_percentile >= 0.8 THEN 5 + WHEN frequency_percentile >= 0.6 THEN 4 + WHEN frequency_percentile >= 0.4 THEN 3 + WHEN frequency_percentile >= 0.2 THEN 2 + ELSE 1 + END AS frequency_score, + CASE + WHEN monetary_percentile >= 0.8 THEN 5 + WHEN monetary_percentile >= 0.6 THEN 4 + WHEN monetary_percentile >= 0.4 THEN 3 + WHEN monetary_percentile >= 0.2 THEN 2 + ELSE 1 + END AS monetary_score + FROM rfm_percentiles +), +rfm_segment AS( +SELECT *, + CASE + WHEN recency_score <= 2 + AND frequency_score <= 2 THEN 'Hibernating' + WHEN recency_score <= 2 + AND frequency_score <= 4 THEN 'At Risk' + WHEN recency_score <= 2 + AND frequency_score <= 5 THEN 'Cannot Lose Them' + WHEN recency_score <= 3 + AND frequency_score <= 2 THEN 'About to Sleep' + WHEN recency_score <= 3 + AND 
frequency_score <= 3 THEN 'Need Attention' + WHEN recency_score <= 4 + AND frequency_score <= 1 THEN 'Promising' + WHEN recency_score <= 4 + AND frequency_score <= 3 THEN 'Potential Loyalists' + WHEN recency_score <= 4 + AND frequency_score <= 5 THEN 'Loyal Customers' + WHEN recency_score <= 5 + AND frequency_score <= 1 THEN 'New Customers' + WHEN recency_score <= 5 + AND frequency_score <= 3 THEN 'Potential Loyalists' + ELSE 'Champions' + END AS rfm_segment +FROM rfm_scores +) +SELECT * +FROM rfm_segment +``` + +The original query uses the current date (obtained by using the `NOW()` function) to calculate the recency of each user, whereas the new approach includes 2 CTEs that allow the RFM scores to be calculated on a monthly basis. + +- The first CTE queries a calendar table and selects the `date_month` column. It also appends a row with the `NOW()` function to calculate the attributes for the current month. + +```sql +months AS( + SELECT NOW() AS date_month + UNION ALL + SELECT DISTINCT date_month AS date_month + FROM ref {{'dim_calendar'}} +), +``` + +| date_month | +| --- | +| 2023-04-25 5:51:09 | +| 2023-04-01 0:00:00 | +| 2023-03-01 0:00:00 | +| 2023-02-01 0:00:00 | +| 2023-01-01 0:00:00 | +- The second CTE has a `LEFT JOIN` that keeps the list of payments the user had until the end of each month, which allows the model to calculate the RFM segment the user had at the end of each period. +- The recency metric is calculated to the end of each month. If the month is not yet finished, we calculate it to the current day (thanks to the `UNION` in the first query). 
+ +```sql +payments_with_months AS( + SELECT user_id, + date_month, + payment_date, + payment_id, + payment_amount + FROM months + JOIN payments ON payment_date <= date_month +), +``` + +| user_id | date_month | payment_date | payment_id | amount | +| --- | --- | --- | --- | --- | +| A | 2023-04-25 5:55:05 | 2022-04-16 19:41:05 | BB | 120 | +| A | 2023-04-25 5:55:05 | 2023-03-23 18:17:46 | AA | 160 | +| A | 2023-04-01 0:00:00 | 2023-03-23 18:17:46 | AA | 160 | +| B | 2023-04-25 5:55:05 | 2022-08-23 17:52:44 | CC | 90 | +| B | 2023-04-01 0:00:00 | 2022-08-23 17:52:44 | CC | 90 | +| E | 2023-04-25 5:55:05 | 2023-02-05 12:17:19 | EE | 10630 | +| E | 2023-04-01 0:00:00 | 2023-02-05 12:17:19 | EE | 10630 | + +### Getting the latest status + +Once we have our historical model built, we can add another model that runs after it in our dependency graph. This can help reduce the latency in use cases where querying the whole history is not needed (like personalization initiatives). + +```sql +WITH rfm_segments AS( + SELECT * + FROM ref {{'model_rfm_segments_hist'}} +), +current_segments AS( + SELECT * + FROM rfm_segments + WHERE date_month = (SELECT MAX(date_month) FROM rfm_segments) +) +SELECT * +FROM current_segments +``` + +### Solution overview + +With the new approach, our dependency graph would look like this: + + + +- For analysts that want to see how the segments changed over time, they can query the historical model. There is also an option to build an aggregated model before loading it in a Business Intelligence tool. +- For ML model training, data scientists and machine learning practitioners can import this model into their notebooks or their feature store, instead of rebuilding the attributes from scratch. +- If you want to personalize the experience of a user based on their segment, like in the CX example from the beginning, you can query the current segmentation and export it to your CRM with a Reverse ETL tool. 
+ +## Conclusions + +This design has trade-offs, notably longer build-time and harder explainability. However, we believe that data teams that invest in this approach will get better datasets for historical analysis, more collaboration with data scientists, and overall greater impact from their analytics engineering efforts. + +## Related resources + +[Operational Analytics in Practice](https://www.getdbt.com/analytics-engineering/use-cases/operational-analytics/) + +[How dbt Labs' data team approaches reverse ETL](https://www.getdbt.com/open-source-data-culture/reverse-etl-playbook/) + +[The Operational Data Warehouse: Reverse ETL, CDPs, and the future of data activation](https://www.getdbt.com/coalesce-2021/operational-data-warehouse-reverse-etl-cdp-data-activation/) + + diff --git a/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md b/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md new file mode 100644 index 00000000000..2a4879ac98d --- /dev/null +++ b/website/blog/2023-07-03-data-vault-2-0-with-dbt-cloud.md @@ -0,0 +1,152 @@ +--- +title: "Data Vault 2.0 with dbt Cloud" +description: "When to use, and when not to use Data Vault 2.0 data modeling, and why dbt Cloud is a great choice" +slug: data-vault-with-dbt-cloud + +authors: [rastislav_zdechovan, sean_mcintyre] + +tags: [analytics craft, data ecosystem] +hide_table_of_contents: true + +date: 2023-07-03 +is_featured: true +--- + +Data Vault 2.0 is a data modeling technique designed to help scale large data warehousing projects. It is a rigid, prescriptive system detailed vigorously in [a book](https://www.amazon.com/Building-Scalable-Data-Warehouse-Vault/dp/0128025107) that has become the bible for this technique. + +So why Data Vault? Have you experienced a data warehousing project with 50+ data sources, with 25+ data developers working on the same data platform, or data spanning 5+ years with two or more generations of source systems? 
If not, it might be hard to initially understand the benefits of Data Vault, and maybe [Kimball modelling](https://docs.getdbt.com/blog/kimball-dimensional-model) is better for you. But if you are in _any_ of the situations listed, then this is the article for you! + + + +Here’s an analogy to help illustrate Data Vault: + +Think of a city’s water supply. Each house does not have a pipe directly from the local river: there is a dam and a reservoir to collect water for the city from all of the sources – the lakes, streams, creeks, and glaciers – before the water is redirected into each neighborhood and finally into each home’s taps. + +A new development in the city? No problem! Just hook up the new pipes to the reservoir! Not enough water? Just find another water source and fill up the reservoir. + +Data Vault is the dam and reservoir: it is the well-engineered data model to structure an organization’s data from source systems for use by downstream data projects – rather than each team collecting data straight from the source. The Data Vault data model is designed using a few well-applied principles, and in practice, pools source data so it is available for use by all downstream consumers. This promotes a scalable data warehouse through reusability and modularity. + + + +## Data Vault components + +Loading your data directly from source systems without applying any business rules implies that you want them stored in a so-called **Raw Vault**. This is most of the time the first step in the journey of transforming your data. There are situations where you’d want to apply business logic before loading the data into your presentation layer; that’s where **Business Vault** comes into play. Performance enhancement or centralized business logic are a few of the reasons for doing so. + +The core components of Data Vault are hubs, links, and satellites. They allow for more flexibility and extensibility and can be used to model complex processes in an agile way. 
+ +Here is what you need to know about the main components: + +* **Hubs**: A hub is the central repository of all business keys identifying the same business entity. By separating data into hubs, we ensure each piece of business concept is as accurate and consistent as possible while avoiding redundancy and ensuring referential integrity; +* **Links**: Links connect your hubs in Data Vault. The relationship is stored as data, which makes it auditable and flexible to change. There are several special types of links, but in most cases, links are bidirectional, meaning you can easily navigate back and forth between business entities. This allows you to analyze complex relationships via connections created by hubs and links in your data model; +* **Satellites**: Satellites store contextual, descriptive, and historical information about the hubs and links they are attached to, depending on whether the data is related to a business object or a relationship. Each satellite in the Data Vault provides additional, valuable information about the main entity. + +You can think of these Raw Vault components as LEGO bricks: they are modular and you can combine them in many different ways to build a wide variety of different, cohesive structures. + +Given its modular structure that requires many joins to get the specific information, Data Vault is not intended as a final, presentation layer in your data warehouse. Instead, due to the wide variety of use cases, the framework works brilliantly as the middle, integration layer of your business, serving any form of presentation layer you might have, such as wide tables, star schema, feature stores, you name it. + +To further accelerate the creation of these layers and prevent the repetition of the same business logic, you can make use of Business Vault as a complementary layer of your data warehouse. 
+ +The Business Vault is there to fill the gaps of generic, source-data-generated Raw Vault, which often does not cover all of the business processes of your organization. You can easily address such challenges by applying soft rules in this layer. + +Business Vault can also help with performance issues that can arise due to presentation layer transformations having to do lots of joins on the fly. In such scenarios, a business vault becomes a central piece of your business logic populating all of the information marts. + +### Should you consider Data Vault for your data warehouse? + +Data Vault is a powerful modelling technique for middle-to-enterprise level data warehouses with the following attributes: + +* Integration of multiple dynamic source systems; +* Long-term project with agile delivery requirements; +* Auditability and compliance needs; +* Preference for template based project allowing automation needs; +* High flexibility of the data model with minimal reengineering; +* Load performance is important, parallel loading is a must. + +Due to its complexity, Data Vault is not a go-to choice for: + +* Simple and constant systems; +* Quick one-off solutions for experiments or short-term data warehouse projects; +* Data warehouse layers needed for direct reporting. + +## dbt Cloud: the operating system for Data Vault + +There are many tools that can be used to implement your Data Vault project but dbt Cloud with its rich set of features provides the functionalities that make the difference by accelerating your project end to end, saving you the trouble of jumping from one tool to another. + +Let’s take a look at the most impactful features and explore how you can leverage them when implementing your Data Vault project. + +### Scalable schema + +Don’t Repeat Yourself (DRY) software engineering principles can help you sleep better when you are dealing with complex projects, which Data Vault most often is. 
+ +dbt's [**macros**](https://docs.getdbt.com/docs/build/jinja-macros) feature is a lifesaver in terms of templating your code. It saves you headaches due to manual errors as well as defining transformation logic in one place in case you need to change it. + +Data Vault follows the insert-only principle with incremental loading strategy. A built-in [**Jinja**](https://docs.getdbt.com/docs/build/jinja-macros) functionality allows you to create one version of the dbt model for both incremental and full load of a table. The easy dependency management that this feature helps you achieve is a huge benefit for highly complex projects. + +If you are new to the framework, taking a look at already built Data Vault macros can be crucial, and even if you are an expert, it can still be beneficial. dbt’s rich set of community [**packages**](https://docs.getdbt.com/docs/build/packages) can be directly applied to your project or used as an inspiration for your own transformation templates. + +Building your transformation templates leveraging reusable macros and flexible Jinja language can enhance your project development in a scalable way. When things get more complex, you are able to go back and change your templates in one place either completely, or using parameters to ensure you don’t mess with what already works well. + +If you are someone who has practiced Data Vault data modeling in another tool, you might appreciate the dbt [**model contracts**](https://docs.getdbt.com/docs/collaborate/govern/model-contracts) as a way to guarantee to your data end-users the exact shape of a dbt transformation. This is a similar practice to writing DDL. + +Scalability also happens at the database layer. 
With [**materializations**](https://docs.getdbt.com/docs/build/materializations), you have fine-grained control over whether a database object built by dbt is persisted as a view, table, or built incrementally, which gives you control over the performance and cost characteristics of each transformation. So if your data platform bill is growing, it’s easy to identify which Data Vault components are the most expensive and make optimizations to reduce cost. + +With the active dbt open source community, there is a good chance you are facing a problem which was already solved by someone else. There are plenty of amazing packages available in the dbt [package hub](https://hub.getdbt.com/), which you can utilise to speed up your development even further. + +### Agile development + +dbt Cloud includes **built-in Git** with accessible features directly from its IDE, which simplifies development immensely. Once a developer is happy with their additions or changes to the Data Vault codebase, they can commit the code within the IDE and open a Pull Request, triggering a code review process. Then, with [continuous integration with dbt Cloud](https://docs.getdbt.com/docs/deploy/continuous-integration), automated checks are run to ensure data quality standards and Data Vault conventions are met, automatically preventing any bad changes from reaching production. + +The biggest boon to Data Vault developer productivity in dbt Cloud are the **DataOps** and **Data Warehouse Automation** features of dbt Cloud. Each Data Vault developer gets their own development environment to work in and there is no complicated set up process to go through. + +Commit your work, create a pull request, and have automated code review enabled by dbt Cloud [**jobs**](https://docs.getdbt.com/docs/deploy/jobs) that can be defined for each environment separately (e.g., testing, QA, production). 
Together with dbt [**tags**](https://docs.getdbt.com/reference/resource-configs/tags), the feature allows you to orchestrate your project in an efficient and powerful way. + +### Auditable data + +One of the main selling points of Data Vault is its auditability. In addition to its own capabilities, dbt Cloud features enhance this advantage even further. Each job execution leaves an [**audit log**](https://docs.getdbt.com/docs/cloud/manage-access/audit-log), which can be leveraged to analyze trends in job performance among other things, allowing you to identify bottlenecks in your system. dbt Cloud stores [**artifact**](https://docs.getdbt.com/docs/deploy/artifacts) files after each execution for further processing and analysis as well, and exposes them programmatically via the [Discovery API](https://www.getdbt.com/blog/introducing-the-discovery-api/). + +dbt has built-in **data lineage**, which helps both developers and data consumers understand just how the data assets in the data warehouse are created. And with the self-serve and automatically generated [**dbt docs**](https://docs.getdbt.com/reference/commands/cmd-docs), you can spend less time answering questions about your data from across the organization and more time building your Data Vault. + +Last but not least, the built-in [**dbt testing framework**](https://docs.getdbt.com/guides/dbt-ecosystem/dbt-python-snowpark/13-testing) allows Data Vault developers to test their assumptions about the data in their database. Not only are primary key checks and foreign key checks easy to add and simple to run, but more complex checks like integer range checks, anomaly detection, and highly sophisticated data quality checks are also possible when expressed as SQL statements. Infinite Lambda have created two dbt packages for data quality, [dq_tools](https://hub.getdbt.com/infinitelambda/dq_tools/latest/) and [dq_vault](https://hub.getdbt.com/infinitelambda/dq_vault/latest/), which are described later in this post.
+ +## How to get started with dbt Cloud and Data Vault + +There are many decisions to make before you roll up your sleeves and start implementing your Data Vault data warehouse. Apart from data modelling work, you need to agree on naming conventions, hash algorithm, staging strategy, and data types for standard metadata attributes, and make sure these are all well documented. Here, to save yourself some headaches in the long run, we recommend starting your own **decision log**. + +In terms of the implementation of the Data Vault itself, we recommend familiarizing yourself with the best practices well in advance, especially if you have no previous experience with the framework. There are two well-known dbt packages focusing on Data Vault implementation, which you can take inspiration from to build your own templating system, or they can be used directly if they fit your use case. + +### AutomateDV (formerly known as dbtvault) + +AutomateDV is the most popular open source Data Vault package for dbt, with some users having over 5000 Data Vault components in their project. Here at Infinite Lambda, we’ve been using this package for quite some time now, even building on top of it (depending on the specifics of the project). This mature system provides a great way to start your Data Vault with dbt Cloud journey as the learning curve is quite manageable, it is well documented and even comes with tutorials and working examples built on top of Snowflake’s TPCH standard dataset. There is one limitation to using the package, and that is that _AutomateDV_ expects your source data to contain only one delta load. In order to work around this issue, owners of the package came up with custom dbt materializations to help you with the initial load of your system; however, the performance of such a load is, in our experience, not acceptable.
+ +_(Editor's note: As of AutomateDV v0.10.0, this performance issue has been resolved and users may use the standard incremental configuration.)_ + +### datavault4dbt + +At first glance, this fairly new open source package works in a similar fashion, especially since the usage of the macros provides the same experience (apart from the names of some of the parameters). Diving deeper into documentation, however, we can see it provides a higher level of customization thanks to many global variables, which alters the behavior of macros. It also supports any type of source data - CDC, transient or persistent, it can handle it all. We suggest looking into this package if you have a deeper understanding of Data Vault and need a complex, customizable system. It’s good to be aware of the fact that this package is new, so there is a risk of hidden unresolved issues. + +### Customizing the existing packages + +These two packages, AutomateDV, and datavault4dbt, are the most popular approaches to building a Data Vault on dbt. However, sometimes these packages don’t quite match an organization’s pre-existing Data Vault practices built with a different tool. At the surface, dbt looks quite simple, but deep down is extremely customizable: it’s possible to make minor modifications to the packages within your project using Jinja, which is a powerful templating language. + +For example, some organizations choose different hashing algorithms to generate their Data Vault hash keys than what comes out-of-the-box with AutomateDV. So to change that, you can add a [dbt macro](https://docs.getdbt.com/docs/build/jinja-macros#macros) called [default__hash_alg_md5](https://github.com/Datavault-UK/automate-dv/blob/3db7cc285e110ae6976d0afe7a93adf9b776b449/macros/supporting/hash_components/select_hash_alg.sql#L32C1-L36) to your project with the custom logic you want. Much of the package logic can be overridden in this way to suit your needs. 
+ +### Build your own system + +Every project is different and needs its own set of features, special treatments tailored to your data, or performance tuning mechanisms. Because of this, for any long term, high priority data warehouse solutions we at [Infinite Lambda](https://infinitelambda.com/) recommend working on your own templating system. It needs significant engineering effort before an actual implementation (and bug fixing during), but you’ll save time later thanks to knowing where to look for a potential issue. If you are not comfortable creating such a system from scratch, you can always start with one of the above open-source packages and build on it once you hit its limits. + +We at Infinite Lambda treat data quality very seriously and we push for high test coverage as well as overall data governance in every project. With the experience from multiple projects, we developed two data quality dbt packages, which can help business users raise trust in your data. + +Within the [dq_tools](https://hub.getdbt.com/infinitelambda/dq_tools/latest/) package, we aim for simple storage of test results and visualization of these in a BI dashboard. Leveraging this tool can help with making sure your system behaves in an expected way, all in the visual format of a dashboard built on your favorite BI tool. The [dq_vault](https://hub.getdbt.com/infinitelambda/dq_vault/latest/) package provides an overview of data quality for all Data Vault models in your dbt project. Complex as they are, Data Vault projects need detailed test coverage to make sure there are no holes in the system. This tool helps with governing your testing strategy and being able to identify issues very quickly. + +To help you get started, [we have created a template GitHub project](https://github.com/infinitelambda/dbt-data-vault-template) you can utilize to understand the basic principles of building Data Vault with dbt Cloud using one of the abovementioned packages.
But if you need help building your Data Vault, get in touch. + + + +### Entity Relation Diagrams (ERDs) and dbt + +Data lineage is dbt's strength, but sometimes it's not enough to help you to understand the relationships between Data Vault components like a classic ERD would. There are a few open source packages to visualize the entities in your Data Vault built with dbt. I recommend checking out the [dbterd](https://dbterd.datnguyen.de/1.2/index.html) which turns your [dbt relationship data quality checks](https://docs.getdbt.com/docs/build/tests#generic-tests) into an ERD. + +## Summary + +By leveraging the building blocks of Data Vault, organizations can build data warehouses that are adaptable to changing business requirements, promote data quality and integrity, and enable efficient data management and analytics. This in turn drives better decision-making, competitive advantage and business growth. + +Choosing the right methodology for building your data warehouse is crucial for your system’s capabilities in the long run. If you are exploring Data Vault and want to learn more, Infinite Lambda can help you make the right call for your organization. diff --git a/website/blog/2023-07-17-GPT-and-dbt-test.md b/website/blog/2023-07-17-GPT-and-dbt-test.md new file mode 100644 index 00000000000..84f756919a5 --- /dev/null +++ b/website/blog/2023-07-17-GPT-and-dbt-test.md @@ -0,0 +1,213 @@ +--- +title: "Create dbt Documentation and Tests 10x faster with ChatGPT" +description: "You can use ChatGPT to infer the context of verbosely named fields from database table schemas." +slug: create-dbt-documentation-10x-faster-with-ChatGPT + +authors: [pedro_brito_de_sa] + +tags: [analytics craft, data ecosystem] +hide_table_of_contents: true + +date: 2023-07-18 +is_featured: true +--- + +Whether you are creating your pipelines into dbt for the first time or just adding a new model once in a while, **good documentation and testing should always be a priority** for you and your team. 
Why do we avoid it like the plague then? Because it’s a hassle having to write down each individual field, its description in layman terms and figure out what tests should be performed to ensure the data is fine and dandy. How can we make this process faster and less painful? + +By now, everyone knows the wonders of the GPT models for code generation and pair programming so this shouldn’t come as a surprise. But **ChatGPT really shines** at inferring the context of verbosely named fields from database table schemas. So in this post I am going to help you 10x your documentation and testing speed by using ChatGPT to do most of the leg work for you. + + + +As a one-person Analytics team at [Sage](http://www.hellosage.com/) I had to create our dbt pipelines from the ground up. This meant 30+ tables of internal facts and dimensions + external data into a Staging Layer, plus all of the following layers of augmented models and Mart tables. After the fact, we are talking about 3500+ lines of YAML that I was NOT excited to get started on. Fortunately for me, this was February 2023 and ChatGPT had just come out. And boy, was I glad to have it. After a good dose of “prompt engineering” I managed to get most of my documentation and tests written out, only needing a few extra tweaks. + +Writing this article as of July 2023, and now powered by GPT-4 and not GPT 3.5, it is already easier to get the same results I did, so here are my learnings that I hope everyone can replicate. + +## Use verbose tables with verbose fields + +ChatGPT can only infer so much, so tables with names and fields that resemble encryption keys are unlikely to be good for this approach. 
In this example we are going to use this table: + +```sql +create or replace TRANSIENT TABLE STAGING.BASE.STG_STAFF_MEMBER ( + ID NUMBER(38,0), + CREATEDATETIME TIMESTAMP_NTZ(9), + UPDATEDATETIME TIMESTAMP_NTZ(9), + VERSION NUMBER(38,0), + FIRSTNAME VARCHAR(16777216), + JOBTITLE VARCHAR(16777216), + LASTNAME VARCHAR(16777216), + MIDDLENAME VARCHAR(16777216), + ISCAREADMIN BOOLEAN, + ISARCHIVED BOOLEAN, + ADDRESSID VARCHAR(16777216), + ENTERPRISEID VARCHAR(16777216), + ISDELETED BOOLEAN +); + +``` + +--- + +I copied this definition from Snowflake, and I would recommend that you always include both the name and type of the fields to get better results. This is especially helpful if you want to make sure ChatGPT correctly distinguishes between text, number, boolean or array fields, which may not be clear from the name alone. + +## Start out with simple prompts + +We all know how ChatGPT can digest very complex prompts, but as this is a tool for you to get the best results for your use case, I would recommend starting with simple, but specific, prompts. + +Opening ChatGPT with GPT4, my first prompt is usually along these lines: + + + +And the output of this simple prompt is nothing short of amazing: + +```yaml +version: 2 + +models: + - name: STAFF_MEMBER + description: This table contains information about the staff members. + columns: + - name: ID + description: The unique identifier for the staff member. + tests: + - unique + - not_null + - name: CREATEDATETIME + description: The timestamp when the record was created. + tests: + - not_null + - name: UPDATEDATETIME + description: The timestamp when the record was last updated. + - name: VERSION + description: Version number of the record. + - name: FIRSTNAME + description: The first name of the staff member. + tests: + - not_null + - name: JOBTITLE + description: The job title of the staff member. + tests: + - not_null + - name: LASTNAME + description: The last name of the staff member. 
+ tests: + - not_null + - name: MIDDLENAME + description: The middle name of the staff member. + - name: ISCARADMIN + description: Boolean value indicating if the staff member is a care administrator. + tests: + - accepted_values: + values: ['true', 'false'] + - name: ISARCHIVED + description: Boolean value indicating if the staff member record is archived. + tests: + - accepted_values: + values: ['true', 'false'] + - name: COMMUNITYID + description: Identifier for the community of the staff member. + - name: ENTERPRISEID + description: Identifier for the enterprise of the staff member. + - name: ISDELETED + description: Boolean value indicating if the staff member record is deleted. + tests: + - accepted_values: + values: ['true', 'false'] +``` + +Back in my day (5 months ago), ChatGPT with GPT 3.5 didn’t have much context on what a dbt-style YAML file was, so the result was a funky, non-dbt YAML format. If you are using GPT 3.5 and the results you get from the first prompt don’t compile in dbt, I would recommend an extra step I took which is to pass your understanding of what a dbt-style YAML file is. You can do this simply by copy-pasting a piece of a publicly available YAML file like [this one from the Jaffle Shop example](https://github.com/dbt-labs/jaffle_shop/blob/main/models/schema.yml), or a piece of YAML from your own code base. + +A prompt for it would look something like: + + + +## Specify details on generic tests in your prompts + +dbt has four built-in generic tests: unique, not_null, accepted_values and relationships. My approach to adding details on these to the prompts is one of ‘more is better’, as in it’s faster to just delete tests that I don’t need in certain fields than having to add them manually. + +ChatGPT will throw unique and not_null tests for mostly all columns that have ‘id’ in their name, and in my experience it adds them to a few more columns than it should, which is usually quick to fix. 
+ +Accepted_values and relationships are slightly trickier but the model can be adjusted to account for these in fairly simple ways. My tips are: + +- accepted_values: add extra detail to the schema like an extra keyword like ‘CATEGORICAL’ after the data type +- relationships: bind the relationships to fields with specific names. For better results, pass on a list of valid table names from your schema that it can refer to. + +One way of doing this is with a prompt like this: + + + +Which results in the following output: + +```yaml +version: 2 + +models: + - name: STAFF_MEMBER + description: This table contains information about the staff members. + columns: + - name: ID + description: The unique identifier for the staff member. + tests: + - unique + - not_null + - name: CREATEDATETIME + description: The timestamp when the record was created. + tests: + - not_null + - name: UPDATEDATETIME + description: The timestamp when the record was last updated. + - name: VERSION + description: Version number of the record. + - name: FIRSTNAME + description: The first name of the staff member. + tests: + - not_null + - name: JOBTITLE + description: The job title of the staff member. This is a categorical field. + tests: + - not_null + - accepted_values: + values: ['Job Title 1', 'Job Title 2', 'Job Title 3'] # replace these with actual job titles + - name: LASTNAME + description: The last name of the staff member. + tests: + - not_null + - name: MIDDLENAME + description: The middle name of the staff member. + - name: ISCARADMIN + description: Boolean value indicating if the staff member is a care administrator. + tests: + - accepted_values: + values: ['true', 'false'] + - name: ISARCHIVED + description: Boolean value indicating if the staff member record is archived. + tests: + - accepted_values: + values: ['true', 'false'] + - name: COMMUNITYID + description: Identifier for the community of the staff member. 
+ tests: + - relationships: + to: STAGING.COMMUNITY.ID # replace with actual reference table + field: ID + - name: ENTERPRISEID + description: Identifier for the enterprise of the staff member. + tests: + - relationships: + to: STAGING.ENTERPRISE.ID # replace with actual reference table + field: ID + - name: ISDELETED + description: Boolean value indicating if the staff member record is deleted. + tests: + - accepted_values: + values: ['true', 'false'] +``` + +Even though it doesn’t magically guess the categories of your variables or your relationships, it is definitely helpful to have some placeholders in the right places. + +As an add-on, giving the model a short description of the data models and the tables you are working with will help it fine tune your results. + +## Wrap-Up + +Creating documentation is still a very manual job, and this approach only works for one table at a time (maybe you can be the one leveraging the OpenAI API and creating a webapp that processes multiple tables at once?). However, ChatGPT can clearly cut a lot of time in these tasks. + +I hope that these simple tips help you be more motivated and efficient in creating documentation and tests for your data models. And remember: verbosity in - verbosity out! diff --git a/website/blog/2023-08-01-announcing-materialized-views.md b/website/blog/2023-08-01-announcing-materialized-views.md new file mode 100644 index 00000000000..3917e3f192c --- /dev/null +++ b/website/blog/2023-08-01-announcing-materialized-views.md @@ -0,0 +1,213 @@ +--- +title: "Optimizing Materialized Views with dbt" +description: "In dbt v1.6, we introduce support for materialized views. In this blog post, Amy will review how to use them in your workflow" +slug: announcing-materialized-views + +authors: [amy_chen] + +tags: [analytics craft, dbt product updates, data ecosystem] +hide_table_of_contents: false + +date: 2023-08-03 +is_featured: true +--- + +## Introduction + +The year was 2020. 
I was a kitten-only household, and dbt Labs was still Fishtown Analytics. An enterprise customer I was working with, Jetblue, asked me for help running their dbt models every 2 minutes to meet a 5 minute SLA. + +After getting over the initial terror, we talked through the use case and soon realized there was a better option. Together with my team, I created [lambda views](https://discourse.getdbt.com/t/how-to-create-near-real-time-models-with-just-dbt-sql/1457) to meet the need. + +Flash forward to 2023. I’m writing this as my giant dog snores next to me (don’t worry, the cats have multiplied as well). Jetblue has outgrown lambda views due to performance constraints (a view can only be so performant) and we are at another milestone in dbt’s journey to support streaming. What. a. time. + +Today we are announcing that we now support Materialized Views in dbt. So, what does that mean? + + + +Materialized views are now an out of the box materialization in your dbt project once you upgrade to the latest version of dbt v1.6 on the following adapters: + +- dbt-postgres +- dbt-redshift +- dbt-snowflake +- dbt-databricks +- dbt-materialize* +- dbt-trino* +- dbt-bigquery** + +*These adapters have supported materialized views in their adapter prior to 1.6. +**dbt-bigquery support will be coming in 1.7. + +Just like you would materialize your sql model as `table` or `view` today, you can use `materialized_view` in your model configuration, dbt_project.yml, and resources.yml files. At release, python models will not be supported.
+ + + +For Postgres/Redshift/Databricks: + +```sql +{{ +config( + materialized = 'materialized_view', +) +}} + +``` + +For Snowflake: +```sql +{{ +config( + materialized = 'dynamic_table', +) +}} +``` + +:::note +We are only supporting dynamic tables on Snowflake, not Snowflake’s materialized views (for a comparison between Snowflake Dynamic Tables and Materialized Views, refer to the [docs](https://docs.snowflake.com/en/user-guide/dynamic-tables-comparison#dynamic-tables-compared-to-materialized-views)). Dynamic tables are better suited for continuous transformations due to functionality like the ability to join, union, and aggregate on base tables, views, and other dynamic tables. Due to those features, they are also more aligned with what other data platforms are calling Materialized Views. For the sake of simplicity, when I refer to materialized views in this blog, I mean dynamic tables in Snowflake. +::: + +Now that we support materialized views: how do you fit them into your dbt workflow? It’s easy to imagine a world of unregulated computation because you didn’t put in proper guardrails and now you have materialized views running rampant unbeknownst to you in your data platform. + +Materialized views, just like any other materialization, fit a need and you should utilize them while taking into consideration the additional complexity they will add to your project. They are a tool in your analytics engineering toolbox, one of many. + +In this blog, we will go over when to pull this tool out of your toolbox, how to wield it successfully, and how to promote materialized views with governance in mind. Now, this is new functionality and I expect this to be the first of many posts to come, defining our best practices (or even redefining them).
Also I will not be discussing dbt’s interactions upstream from the data platform like how to manage your Kafka topics using dbt, but would highly recommend [this post from Charlie Summers](https://docs.getdbt.com/blog/demystifying-event-streams) if that’s something you’re interested in. + +Additionally, if you want to get a more detailed understanding of your data platform’s support of materialized views, I recommend checking out dbt’s and your data platform’s documentation site. This blog post is intended to be a high level, platform agnostic overview to get you started. + +## What are Materialized Views? + +Starting out with, **what are materialized views (MVs)?** While specific features will vary by data platform, materialized views at their core are database objects that have stored the results of a query as a physically materialized table. What makes them distinct from a regular table is that the data in a materialized view is periodically refreshed to reflect the latest changes in the underlying table. Because they’re precomputed and the results are stored, you have faster query times when accessing them because you aren’t recomputing the data from scratch. This is great when you have low latency requirements for your data pipelines. + +Now you might have noticed that MVs sound a lot like incremental models, and you are not wrong! It can be worthwhile to think of materialized views as a successor of sorts to incremental models. In fact, depending on your needs and data platform of choice, you might wish to replace all of your incremental dbt models with materialized view models. By doing this, you will no longer have to manually craft specific incremental strategies, detailing how dbt should update the underlying table. Awesome, right? + +The tradeoff (outside of any data platform specific ones) is that you will have less fine-grained control over the incremental logic and orchestration. 
This is because you are handing the logic of what to update in the existing table, and how to update it, over to the data platform to perform for you. + +Other factors to consider when deciding on when/how to use a materialized view: +- What are the costs associated with running the materialized view versus a batched incremental model? (this will vary depending on your data platform as some will require different compute nodes) +- Does your data platform support joins, aggregations, and window functions on MVs if you need them? +- What are the latency needs of your development environment? In production? (If not near real time, you can make the choice between a batch incremental model or a MV with a longer refresh schedule.) +- How often do your upstream dependencies update? If your answer is `not frequent`, you may not need a MV. +- How large is your dataset? (It might be cheaper to use MVs for extremely large datasets) +- How often do you need your query refreshed? What are your downstream dependencies and their stakeholders? (If near real time is important, MVs might be the right choice). +- Do you have real time machine learning models training or applications using your transformed dataset? + +## Materialized Views in the dbt Workflow + +### Development + +When we talk about using materialized views in development, the question to think about is not so much “should you execute your dbt models as materialized views in your sandbox?,” but rather “should you schedule them to refresh in your sandbox?”. For development, you do need to create them and test them out in your sandbox but how do you do this in a way that doesn’t drive up your cloud bill unnecessarily? Or without keeping a post-it note on your laptop as a reminder to drop all of the running materialized views in your sandbox before you sign off? Let’s talk about it! + +Outside of the scheduling part, development will be pretty standard.
Your pipeline is likely going to look something like this: + + + +This is assuming you have a near real time pipeline where you are pulling from a streaming data source like a Kafka Topic via an ingestion tool of your choice like Snowpipe for Streaming into your data platform. After your data is in the data platform, you will: + +1. Create the dbt model with the SQL transformation logic that you need. +2. Look at the logic and answer these questions: + 1. Does my data platform support the functionality I need in materialized views? + 2. How often do I need the data refreshed? Do I need any flexibility in that? + 3. How am I promoting this into production? Either you will run the transformation logic in the production environment (recommended) and create a separate object or promote the object created from development. + + +Your answers will determine whether you want a materialized view in the first place (versus a view, table, or incremental model). If you have decided on a materialized view as meeting your needs, by default do not schedule a refresh. You can run manual refreshes as needed. Why’s that? In your development environment, you are likely validating three things: the dependencies, the SQL logic, and the transformation output. All of those can be tested by creating a materialized view without scheduling and running manual refreshes. + +Your configuration during development: + +For Postgres: + +Every time you run a `dbt run`, that will result in a manual refresh unless you set the `on_configuration_change` to `continue` which will skip running the model.
+ +```sql +{{ +config( + materialized = 'materialized_view', + on_configuration_change = 'apply', +) +}} +``` + +For Redshift: + +```sql +{{ +config( + materialized = 'materialized_view', + on_configuration_change = 'apply', + auto_refresh = False +) +}} +``` + +For Databricks: + +```sql +{{ +config( + materialized='materialized_view', +) +}} +``` + +By default, materialized views are not refreshed on a schedule on Databricks in this materialization. To set up scheduling, you can use a post-hook to alter the MV with a cron schedule that will run in Databricks Workflows. That could look like something like this + +```sql +post_hook = 'ALTER MATERIALIZED VIEW {{this}} ADD SCHEDULE CRON "0 0 0 * * ? *" AT TIME ZONE "America/Los_Angeles";' +``` + +For Snowflake: + +```sql +{{ +config( + materialized = 'dynamic_table', + snowflake_warehouse = '', + target_lag = '', + on_configuration_change = 'apply', +) +}} +``` + +Now if you do need to more fully build out your development pipeline (making sure scheduling/syncs do happen), you can schedule but make sure to drop the materialized views when you are done with them. I encourage you to invest in an operations macro that drops all MVs in the schema that you use as your sandbox and run it as needed. You could even create a dbt Cloud job to manage that. This way, you don’t have any stray MVs running in your sandbox, consuming credits unnecessarily. + +### Testing + +Now let’s dive into the second question: how do you test? In development and QA, this will look the same as any batch run tests. You can run `dbt build` or  `dbt test` and then have the tests run after execution as validation. But in production, what can you do to continually test? 
Your options are: + +- Continue to do batch testing as we wait for [materialized tests](https://github.com/dbt-labs/dbt-core/issues/6914) +- Override the `--store-failures` macro, as Materialize has done [here](https://materialize.com/blog/real-time-data-quality-tests-using-dbt-and-materialize/) for their adapter, to materialize failed rows as a materialized view. This is not a great solution for the long term but if you have urgency to put this into production, it is an option. + +In order to promote materialized views into production, the process will look very much like it did with your incremental models. Using SlimCI, for new MVs, you can build them into your QA environment. For existing MVs without changes, we can skip and defer to the production objects. + +### Production + +When you feel satisfied with your development and testing, for data platforms that offer scheduling via our dbt configurations, you have two options: hardcode the refresh cadence or write in conditional logic based on the environment for the refresh cadence. I recommend using the latter. + +The code for having a conditional in your config block looks like this if you want to include it in a macro for either the lag or other fields (snowflake_warehouse, auto_refresh, etc.): + +```sql +{% macro target_lag_environment() %} +{% set lag = '1 minute' if target.name == "prod" else '35 days' %} +{{ return(lag) }} +{% endmacro %} +``` + +```sql +{{ +config( + materialized = 'dynamic_table', + snowflake_warehouse = 'transforming', + target_lag = target_lag_environment(), + on_configuration_change = 'apply', +) +}} +``` + +You will want a very long lag for development; I recommend matching the cadence at which you drop and refresh your development environment. Here I just chose my two favorite numbers. + +For orchestration, if your materialized views aren’t able to auto refresh, you can use dbt Cloud to orchestrate your refreshes.
The beauty of materialized views is that dbt will be able to provide the dependency/testing/documentation but also skip or rerun the models as configured, enabling you to version control your logic. Reasonable guardrails for the modern data stack. ✨ + +Depending on how you orchestrate your materialized views, you can run the testing in production as part of a scheduled job (with `dbt test` or `dbt build`). + +## Conclusion + +Well, I’m excited for everyone to remove the lines in your packages.yml that installed your experimental package (at least if you’re using it for MVs) and start to get your hands dirty. We are still new in our journey and I look forward to hearing all the things you are creating and how we can better our best practices in this. \ No newline at end of file diff --git a/website/blog/authors.yml b/website/blog/authors.yml index b8437601b5b..2e554ffc814 100644 --- a/website/blog/authors.yml +++ b/website/blog/authors.yml @@ -1,372 +1,535 @@ -david_krevitt: - name: David Krevitt - job_title: Marketing - image_url: /img/blog/authors/dkrevitt.jpg - organization: dbt Labs - description: Laziness is a virtue +amy_chen: + image_url: /img/blog/authors/achen.png + job_title: Staff Partner Engineer links: - - url: https://twitter.com/dkrevitt - icon: fa-twitter - -jason_ganz: - name: Jason Ganz - job_title: Developer Experience + - icon: fa-linkedin + url: https://www.linkedin.com/in/yuanamychen/ + name: Amy Chen organization: dbt Labs - image_url: /img/blog/authors/jasnonaz.jpeg - links: - - url: https://twitter.com/jasnonaz - icon: fa-twitter -sanjana_sen: - name: Sanjana Sen +andrew_escay: + image_url: /img/blog/authors/aescay.jpeg job_title: Analytics Engineer - organization: dbt Labs - image_url: /img/blog/authors/sanjana.png links: - - url: https://twitter.com/sqlsanj - icon: fa-twitter - -lauren_craigie: - name: Lauren Craigie - job_title: Product Marketing + - icon: fa-twitter + url: https://twitter.com/_aescay + name: Andrew Escay organization:
dbt Labs - image_url: /img/blog/authors/craigie.png - links: - - url: https://twitter.com/crai_fish - icon: fa-twitter -andrew_escay: - name: Andrew Escay +arthur_marcon: + image_url: /img/blog/authors/arthur-marcon.png job_title: Analytics Engineer - organization: dbt Labs - image_url: /img/blog/authors/aescay.jpeg - links: - - url: https://twitter.com/_aescay - icon: fa-twitter + name: Arthur Marcon + organization: Indicium Tech + description: Arthur Marcon is an Analytics Engineer at Indicium, helping data-driven companies transition to the modern data analytics stack in the cloud through dbt. Arthur is also an enthusiast of technology and innovation. He is mainly interested in how data plays a key role to enable companies to achieve their full innovative and technological potential. Outside of work, he is a movie aficionado, and likes to hit the gym and read Agatha Christie's novels. -jess_williams: - name: Jess Williams - job_title: Head of Professional Services - organization: dbt Labs - image_url: /img/blog/authors/jess.png +barr_yaron: + image_url: /img/blog/authors/barr-yaron.png + job_title: Product Manager links: - - url: https://www.linkedin.com/in/jessdwilliams/ - icon: fa-linkedin - -pat_kearns: - name: Pat Kearns - job_title: Senior Analytics Engineer + - icon: fa-linkedin + url: https://www.linkedin.com/in/barryaron/ + name: Barr Yaron organization: dbt Labs - image_url: /img/blog/authors/pkearns.png + +bennie_regenold: + image_url: /img/blog/authors/bennie-regenold.png + job_title: Analytics Engineer links: - - url: https://www.linkedin.com/in/pat-kearns/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/benjaminregenold/ + name: Bennie Regenold + organization: dbt Labs -amy_chen: - name: Amy Chen - job_title: Senior Partner Engineer +benoit_perigaud: + image_url: /img/blog/authors/benoit-perigaud.jpeg + job_title: Staff Analytics Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/benoit-perigaud/ 
+ name: Benoit Perigaud organization: dbt Labs - image_url: /img/blog/authors/achen.png + +brittany_krauth: + description: Brittany Krauth works as Manager, Analytics & Insights supporting Degreed's upskilling platform. Brittany is passionate about building a company-wide data-driven culture. She has worked in various analytical roles, from a focus on predictive analytics to data visualization to process improvements. In addition, she holds a BS in Industrial Engineering from Georgia Tech. In her spare time, Brittany competes in dog agility and trains donkeys. + image_url: /img/blog/authors/brittany-krauth.png + job_title: Manager, Analytics & Insights links: - - url: https://www.linkedin.com/in/yuanamychen/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/brittanykrauth + name: Brittany Krauth + organization: Degreed -joel_labes: - name: Joel Labes +callum_mccann: + description: Callum works on metrics and is either talking about that or obsessing about his dog. + image_url: /img/blog/authors/callum-mccann.jpg job_title: Senior Developer Experience Advocate + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/callum-mccann-38628a89/ + - icon: fa-twitter + url: https://twitter.com/callumpmccann + name: Callum McCann organization: dbt Labs - image_url: /img/blog/authors/jlabes.png + +callie_white: + description: Callie White is a Senior Analytics Consultant at Montreal Analytics where she leads projects in partnership with client data teams to achieve the best-practice and best-suited results. She’s also passionate about mentoring developers and demystifying the complexity of a role in data. Callie uses data-driven decision-making in everything she does; mainly to maximize cuddle time with her rescue pup named Snacks, or to scope out her next hiking trip. Montreal Analytics is a full-stack data consultancy servicing North America, and are both a dbt Preferred Consulting Partner and Platinum dbt Certification Award winner. 
We help organizations on the whole data journey, from strategic advisory to hands-on development and enablement. Whether deploying greenfield data platforms, tackling complex migrations or auditing & refactoring entangled data models, we build scalable solutions for organizations of all sizes. If you are interested in learning more about how we help clients across the modern stack, contact us today. + image_url: /img/blog/authors/callie-white.jpg + job_title: Analytics Consultant + name: Callie White + organization: Montreal Analytics + +charlie_summers: + description: Charlie is the Data Engineer Tech Lead at Merit. He introduced Merit to dbt and it's been a fantastic fit for a wide variety of data pipelines. He likes thinking about the future of data - integrating event streams, analyzing encrypted data, capturing fine-grained lineage, and making it easy to build simple apps on top of data warehouses/lakes. + image_url: /img/blog/authors/charlie-summers.jpeg + job_title: Staff Software Engineer links: - - url: https://www.linkedin.com/in/joel-labes/ - icon: fa-linkedin - - url: https://twitter.com/joellabes - icon: fa-twitter + - icon: fa-linkedin + url: https://www.linkedin.com/in/charliesummers + name: Charlie Summers + organization: Merit -claire_carroll: - name: Claire Carroll +christian_vanbellen: + image_url: /img/blog/authors/christian-van-bellen.JPG job_title: Analytics Engineer - organization: analyticsengineers.club + name: Christian van Bellen + organization: Indicium Tech + description: Christian van Bellen is an Analytics Engineer at Indicium Tech, acting mainly in projects focused on Enterprise-level data stack transition. Also works with the DataScience/ML stack. Christian started using dbt in 2022 to help implement a modern data stack within enterprises, and is a certified dbt developer. 
+ +christine_berger: + image_url: /img/blog/authors/christine-berger.jpeg + job_title: Senior Analytics Engineer + name: Christine Berger + organization: dbt Labs + +christophe_oudar: + image_url: /img/blog/authors/christophe-oudar.png + job_title: Staff Software Engineer + name: Christophe Oudar + organization: Teads + +claire_carroll: image_url: /img/blog/authors/claire.jpeg + job_title: Analytics Engineer links: - - url: https://twitter.com/clairebcarroll - icon: fa-twitter - - url: https://github.com/clrcrl - icon: fa-github - - url: https://www.linkedin.com/in/clrcrl/ - icon: fa-linkedin + - icon: fa-twitter + url: https://twitter.com/clairebcarroll + - icon: fa-github + url: https://github.com/clrcrl + - icon: fa-linkedin + url: https://www.linkedin.com/in/clrcrl/ + name: Claire Carroll + organization: analyticsengineers.club dave_connors: + image_url: /img/blog/authors/dconnors.jpeg + job_title: Senior Developer Experience Advocate + links: + - icon: fa-github + url: https://github.com/dave-connors-3 + - icon: fa-linkedin + url: https://www.linkedin.com/in/daveconnors3/ name: Dave Connors - job_title: Senior Analytics Engineer organization: dbt Labs - image_url: /img/blog/authors/dconnors.jpeg + +david_krevitt: + description: Laziness is a virtue + image_url: /img/blog/authors/dkrevitt.jpg + job_title: Marketing links: - - url: https://github.com/dave-connors-3 - icon: fa-github - - url: https://www.linkedin.com/in/daveconnors3/ - icon: fa-linkedin + - icon: fa-twitter + url: https://twitter.com/dkrevitt + name: David Krevitt + organization: dbt Labs -ross_turk: - name: Ross Turk - job_title: VP Marketing - organization: Datakin - image_url: /img/blog/authors/ross-turk.png +donny_flynn: + description: Donny Flynn is a customer data architect at Census - a reverse ETL product that helps companies operationalize analytics. Prior to joining Census, Donny built out the data stack at Owner and led a data team at Chiper. 
When not doing data work, he's most likely watching Chicago sports. + image_url: /img/blog/authors/dflynn.jpeg + job_title: Customer Data Architect links: - - url: https://mobile.twitter.com/rossturk - icon: fa-twitter - - url: https://github.com/rossturk - icon: fa-github + - icon: fa-twitter + url: https://twitter.com/donmandonguy + - icon: fa-linkedin + url: https://www.linkedin.com/in/donny-flynn-578149a4/ + name: Donny Flynn + organization: Census -sung_chung: - name: Sung Won Chung +doug_beatty: + image_url: /img/blog/authors/dbeatty.jpeg + job_title: Senior Developer Experience Advocate + name: Doug Beatty + organization: dbt Labs + +doug_guthrie: + image_url: /img/blog/authors/dguthrie.jpeg job_title: Solutions Architect + links: + - icon: fa-github + url: https://github.com/dpguthrie + - icon: fa-linkedin + url: https://www.linkedin.com/in/doug-guthrie-07994a48/ + name: Doug Guthrie organization: dbt Labs - image_url: /img/blog/authors/sung.jpeg + +emily_riederer: + description: Emily Riederer is a Senior Analytics Manager at Capital One where she leads a team delivering a portfolio of data products, enterprise analysis tools, and data science solutions to business partners. As part of the dbt community, she develops and maintains the dbtplyr package. In her spare time, Emily frequently writes about data (see emilyriederer.com, The R Markdown Cookbook, and 97 Things Every Data Engineer Should Know), reviews technical manuscripts for CRC Press, and supports open research software engineering as a member of the rOpenSci editorial board. 
+ image_url: /img/blog/authors/emily-riederer.jpeg + job_title: Senior Manager Analytics + name: Emily Riederer + organization: Capital One links: - - url: https://www.linkedin.com/in/sungwonchung1/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/emilyriederer/ + - icon: fa-twitter + url: https://twitter.com/emilyriederer + - icon: fa-github + url: https://github.com/emilyriederer + - icon: fa-readme + url: https://emilyriederer.com -seth_rosen: - name: Seth Rosen - job_title: Co-Founder & CEO - organization: TopCoat Data - description: Seth Rosen is co-founder and CEO of TopCoat Data - a platform for helping organizations build analytical applications. Prior to founding TopCoat, Seth helped companies of all sizes build custom data apps on top of the modern data stack through his consultancy, Hashpath. When he’s not tweeting and thinking about data, he’s tweeting while parenting two toddlers. - image_url: /img/blog/authors/seth-rosen.jpeg +grace_goheen: + image_url: /img/blog/authors/grace-goheen.jpeg + job_title: Analytics Engineer links: - - url: https://twitter.com/sethrosen - icon: fa-twitter - - url: https://www.linkedin.com/in/sdrosen/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/gracegoheen/ + name: Grace Goheen + organization: dbt Labs -donny_flynn: - name: Donny Flynn - job_title: Customer Data Architect - organization: Census - description: Donny Flynn is a customer data architect at Census - a reverse ETL product that helps companies operationalize analytics. Prior to joining Census, Donny built out the data stack at Owner and led a data team at Chiper. When not doing data work, he's most likely watching Chicago sports.
- image_url: /img/blog/authors/dflynn.jpeg +ian_fahey: + image_url: /img/blog/authors/ian-fahey.png + job_title: Analytics Engineer links: - - url: https://twitter.com/donmandonguy - icon: fa-twitter - - url: https://www.linkedin.com/in/donny-flynn-578149a4/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/ianmfahey/ + - icon: fa-twitter + url: https://twitter.com/Cavorax + name: Ian Fahey + organization: dbt Labs izzy_erekson: - name: Izzy Erekson + image_url: /img/blog/authors/izzy.jpeg job_title: Solutions Architect + name: Izzy Erekson organization: dbt Labs - image_url: /img/blog/authors/izzy.jpeg -nate_sooter: - name: Nate Sooter - job_title: Manager of BI Operations - organization: Smartsheet - image_url: /img/blog/authors/nate-sooter.jpeg +jason_ganz: + image_url: /img/blog/authors/jasnonaz.jpeg + job_title: Developer Experience links: - - url: https://twitter.com/natesooter - icon: fa-twitter - - url: https://www.linkedin.com/in/nathansooter - icon: fa-linkedin + - icon: fa-twitter + url: https://twitter.com/jasnonaz + name: Jason Ganz + organization: dbt Labs -kira_furuichi: - name: Kira Furuichi - job_title: Technical Writer +jade_milaney: + description: Jade Milaney is an Analytics Consultant at Montreal Analytics where Jade works with client data teams to implement data models and data engineering solutions. Jade studied mathematics in university, and has an interest in all things statistics and analytics. Montreal Analytics is a full-stack data consultancy servicing North America, and are both a dbt Preferred Consulting Partner and Platinum dbt Certification Award winner. We help organizations on the whole data journey, from strategic advisory to hands-on development and enablement. Whether deploying greenfield data platforms, tackling complex migrations or auditing & refactoring entangled data models, we build scalable solutions for organizations of all sizes.
If you are interested in learning more about how we help clients across the modern stack, contact us today. + image_url: /img/blog/authors/jade-milaney.jpg + job_title: Analytics Consultant + name: Jade Milaney + organization: Montreal Analytics + +jeremy_cohen: + image_url: /img/blog/authors/jerco.jpeg + job_title: Product Manager + name: Jeremy Cohen organization: dbt Labs - image_url: /img/blog/authors/kira-furuichi.png - links: - - url: https://www.linkedin.com/in/kira-furuichi/ - icon: fa-linkedin -simon_podhajsky: - name: Simon Podhajsky - job_title: Data Lead - organization: iLife Technologies - description: > - Simon Podhajsky is a lapsed neuroscientist turned data everything at iLife - Technologies, a startup that seeks to digitize the life insurance agency. - image_url: /img/blog/authors/simon-podhajsky.jpeg +jess_williams: + image_url: /img/blog/authors/jess.png + job_title: Head of Professional Services links: - - url: https://www.linkedin.com/in/simonpodhajsky/ - icon: fa-linkedin - - url: https://twitter.com/sim_pod - icon: fa-twitter + - icon: fa-linkedin + url: https://www.linkedin.com/in/jessdwilliams/ + name: Jess Williams + organization: dbt Labs -viraj_parekh: - name: Viraj Parekh - job_title: Field CTO - organization: Astronomer - image_url: /img/blog/authors/viraj-parekh.jpeg +joao_antunes: + image_url: /img/blog/authors/joao_antunes.jpg + job_title: Lead Data Engineer, Global Product Strategy links: - - url: https://www.linkedin.com/in/viraj-parekh-46114689/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/jo%C3%A3o-bernardo-pires-antunes + name: João Antunes + organization: Roche -josh_fell: - name: Josh Fell - job_title: Ecosystem Engineer - organization: Astronomer - image_url: /img/blog/authors/josh-fell.jpeg +joe_markiewicz: + description: Joe is a dbt package maintainer/manager at Fivetran by day, and a cat obsessed video game developer by night. Actually, Joe is cat obsessed all day. 
+ image_url: /img/blog/authors/joe-markiewicz.jpeg + job_title: Analytics Engineering Manager (Fivetran dbt package maintainer) links: - - url: https://www.linkedin.com/in/josh-fell/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/joseph-markiewicz-8224a990/ + - icon: fa-twitter + url: https://twitter.com/JoeMarkiewicz17 + name: Joe Markiewicz + organization: Fivetran -simo_tumelius: - name: Simo Tumelius - job_title: Freelance Data and Analytics Engineer - image_url: /img/blog/authors/simo-tumelius.jpeg +joel_labes: + image_url: /img/blog/authors/jlabes.png + job_title: Senior Developer Experience Advocate links: - - url: https://www.linkedin.com/in/simo-tumelius-00a27a162/ - icon: fa-linkedin - -matt_winkler: - name: Matt Winkler - job_title: Senior Solutions Architect + - icon: fa-linkedin + url: https://www.linkedin.com/in/joel-labes/ + - icon: fa-twitter + url: https://twitter.com/joellabes + name: Joel Labes organization: dbt Labs - description: Matt is an ex-data scientist who chose to embrace the simplicity of using SQL to manage and testing data pipelines with dbt. He previously worked as a hands-on ML practitioner, and consulted with Fortune 500 clients to build and maintain ML Ops pipelines using (mostly) AWS Sagemaker. He lives in the Denver area, and you can say hello on dbt Slack or on LinkedIn. - image_url: /img/blog/authors/matt-winkler.jpeg - links: - - url: https://www.linkedin.com/in/matt-winkler-4024263a/ jonathan_natkins: - name: Jon "Natty" Natkins - job_title: Regional Director, Solutions Architecture - organization: dbt Labs description: Natty also writes about startups, equity, data, and more in his Substack called [Semi-Structured](http://semistructured.substack.com/). 
image_url: /img/blog/authors/jonathan-natkins.jpeg + job_title: Regional Director, Solutions Architecture links: - - url: https://www.linkedin.com/in/nattyice/ - icon: fa-linkedin - - url: https://twitter.com/nattyice - icon: fa-twitter - -lauren_benezra: - name: Lauren Benezra - job_title: Analytics Engineer + - icon: fa-linkedin + url: https://www.linkedin.com/in/nattyice/ + - icon: fa-twitter + url: https://twitter.com/nattyice + name: Jon "Natty" Natkins organization: dbt Labs - image_url: /img/blog/authors/lauren-benezra.jpeg + +josh_fell: + image_url: /img/blog/authors/josh-fell.jpeg + job_title: Ecosystem Engineer links: - - url: https://www.linkedin.com/in/lbenezra/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/josh-fell/ + name: Josh Fell + organization: Astronomer -christine_berger: - name: Christine Berger - job_title: Senior Analytics Engineer +kira_furuichi: + image_url: /img/blog/authors/kira-furuichi.png + job_title: Technical Writer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/kira-furuichi/ + name: Kira Furuichi organization: dbt Labs - image_url: /img/blog/authors/christine-berger.jpeg -grace_goheen: - name: Grace Goheen +lauren_benezra: + image_url: /img/blog/authors/lauren-benezra.jpeg job_title: Analytics Engineer - organization: dbt Labs - image_url: /img/blog/authors/grace-goheen.jpeg links: - - url: https://www.linkedin.com/in/gracegoheen/ - icon: fa-linkedin - -jeremy_cohen: - name: Jeremy Cohen - job_title: Product Manager + - icon: fa-linkedin + url: https://www.linkedin.com/in/lbenezra/ + name: Lauren Benezra organization: dbt Labs - image_url: /img/blog/authors/jerco.jpeg -doug_beatty: - name: Doug Beatty - job_title: Senior Developer Experience Advocate +lauren_craigie: + image_url: /img/blog/authors/craigie.png + job_title: Product Marketing + links: + - icon: fa-twitter + url: https://twitter.com/crai_fish + name: Lauren Craigie organization: dbt Labs - image_url: 
/img/blog/authors/dbeatty.jpeg -callum_mccann: - name: Callum McCann - job_title: Senior Developer Experience Advocate +lucas_bergodias: + image_url: /img/blog/authors/lucas-dias.jpg + job_title: Analytics Engineer + name: Lucas Bergo Dias + organization: Indicium Tech + description: Lucas Dias works as an Analytics Engineer at Indicium Tech, a leading technology consultancy that specializes in data-driven solutions. Lucas brings a wealth of expertise and experience to the role, having worked in the analytics field for two years. At Indicium, he assists companies in transitioning to the modern data stack through the use of dbt. He's passionate about using data to drive innovation and growth, and primarily works on enterprise-level projects. + +matt_winkler: + description: Matt is an ex-data scientist who chose to embrace the simplicity of using SQL to manage and test data pipelines with dbt. He previously worked as a hands-on ML practitioner, and consulted with Fortune 500 clients to build and maintain ML Ops pipelines using (mostly) AWS Sagemaker. He lives in the Denver area, and you can say hello on dbt Slack or on LinkedIn. + image_url: /img/blog/authors/matt-winkler.jpeg + job_title: Senior Solutions Architect + links: + - url: https://www.linkedin.com/in/matt-winkler-4024263a/ + name: Matt Winkler organization: dbt Labs - description: Callum works on metrics and is either talking about that or obsessing about his dog.
- image_url: /img/blog/authors/callum-mccann.jpg + +mikael_thorup: + image_url: /img/blog/authors/mikael-thorup.jpeg + job_title: Senior Analytics Engineer + name: Mikael Thorup + organization: Lunar + +nate_sooter: + image_url: /img/blog/authors/nate-sooter.jpeg + job_title: Manager of BI Operations links: - - url: https://www.linkedin.com/in/callum-mccann-38628a89/ - icon: fa-linkedin - - url: https://twitter.com/callumpmccann - icon: fa-twitter + - icon: fa-twitter + url: https://twitter.com/natesooter + - icon: fa-linkedin + url: https://www.linkedin.com/in/nathansooter + name: Nate Sooter + organization: Smartsheet -benoit_perigaud: - name: Benoit Perigaud +noah_kennedy: + image_url: /img/blog/authors/noah-kennedy.png + job_title: Software Developer + description: Noah Kennedy is a Software Developer at Tempus Labs, helping manage the world's largest repository of clinical and molecular patient data. Noah picked up dbt at Tempus 2 years ago, and has worked extensively on internal dbt development for Tempus — creating macros, custom materializations, and testing frameworks that allow scientists to experiment more and data engineer less. Outside of Tempus, Noah spends his time running, biking, playing music, and exploring coffee shops. You can find more of his writing at https://noahkennedy.co/about/.
+ organization: Tempus + name: Noah Kennedy + +pat_kearns: + image_url: /img/blog/authors/pkearns.png job_title: Senior Analytics Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/pat-kearns/ + name: Pat Kearns organization: dbt Labs - image_url: /img/blog/authors/benoit-perigaud.jpeg + +pedro_brito_de_sa: + image_url: /img/blog/authors/pedro_brito.jpeg + job_title: Product Analyst links: - - url: https://www.linkedin.com/in/benoit-perigaud/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/pbritosa/ + name: Pedro Brito de Sa + organization: Sage -bennie_regenold: - name: Bennie Regenold + +rastislav_zdechovan: + image_url: /img/blog/authors/rastislav-zdechovan.png job_title: Analytics Engineer - organization: dbt Labs - image_url: /img/blog/authors/bennie-regenold.png links: - - url: https://www.linkedin.com/in/benjaminregenold/ - icon: fa-linkedin + - icon: fa-linkedin + url: https://www.linkedin.com/in/rastislav-zdechovan/ + name: Rastislav Zdechovan + organization: Infinite Lambda -barr_yaron: - name: Barr Yaron - job_title: Product Manager - organization: dbt Labs - image_url: /img/blog/authors/barr-yaron.png +ross_turk: + image_url: /img/blog/authors/ross-turk.png + job_title: VP Marketing links: - - url: https://www.linkedin.com/in/barryaron/ - icon: fa-linkedin + - icon: fa-twitter + url: https://mobile.twitter.com/rossturk + - icon: fa-github + url: https://github.com/rossturk + name: Ross Turk + organization: Datakin -ian_fahey: - name: Ian Fahey +sam_harting: + description: Sam is a graduate of the inaugural Foundry Program. When Sam isn't helping clients build out their project, he is usually playing video games or making bad jokes to his friends, family, and coworkers. 
+ image_url: /img/blog/authors/sam-harting.png + job_title: Associate Analytics Engineer + name: Samuel Harting + organization: dbt Labs + +sanjana_sen: + image_url: /img/blog/authors/sanjana.png job_title: Analytics Engineer + links: + - icon: fa-twitter + url: https://twitter.com/sqlsanj + name: Sanjana Sen organization: dbt Labs - image_url: /img/blog/authors/ian-fahey.png + +sean_mcintyre: + image_url: /img/blog/authors/sean_mcintyre.jpg + job_title: Senior Solutions Architect links: - - url: https://www.linkedin.com/in/ianmfahey/ - icon: fa-linkedin - - url: https://twitter.com/Cavorax - icon: fa-twitter + - icon: fa-linkedin + url: https://www.linkedin.com/in/boxysean + name: Sean McIntyre + organization: dbt Labs -joe_markiewicz: - name: Joe Markiewicz - job_title: Analytics Engineering Manager (Fivetran dbt package maintainer) - description: Joe is a dbt package maintainer/manager at Fivetran by day, and a cat obsessed video game developer by night. Actually, Joe is cat obsessed all day. - organization: Fivetran - image_url: /img/blog/authors/joe-markiewicz.jpeg +seth_rosen: + description: Seth Rosen is co-founder and CEO of TopCoat Data - a platform for helping organizations build analytical applications. Prior to founding TopCoat, Seth helped companies of all sizes build custom data apps on top of the modern data stack through his consultancy, Hashpath. When he’s not tweeting and thinking about data, he’s tweeting while parenting two toddlers. 
+ image_url: /img/blog/authors/seth-rosen.jpeg + job_title: Co-Founder & CEO links: - - url: https://www.linkedin.com/in/joseph-markiewicz-8224a990/ - icon: fa-linkedin - - url: https://twitter.com/JoeMarkiewicz17 - icon: fa-twitter + - icon: fa-twitter + url: https://twitter.com/sethrosen + - icon: fa-linkedin + url: https://www.linkedin.com/in/sdrosen/ + name: Seth Rosen + organization: TopCoat Data -yu_ishikawa: - name: Yu Ishikawa - job_title: Senior Data Privacy Engineer - organization: Ubie - image_url: /img/blog/authors/yu-ishikawa.jpg - links: - - url: https://www.linkedin.com/in/yuishikawa0301 - icon: fa-linkedin +simo_tumelius: + image_url: /img/blog/authors/simo-tumelius.jpeg + job_title: Freelance Data and Analytics Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/simo-tumelius-00a27a162/ + name: Simo Tumelius -brittany_krauth: - name: Brittany Krauth - job_title: Manager, Analytics & Insights - description: Brittany Krauth works as Manager, Analytics & Insights supporting Degreed's upskilling platform. Brittany is passionate about building a company-wide data-driven culture. She has worked in various analytical roles, from a focus on predictive analytics to data visualization to process improvements. In addition, she holds a BS in Industrial Engineering from Georgia Tech. In her spare time, Brittany competes in dog agility and trains donkeys. - organization: Degreed - image_url: /img/blog/authors/brittany-krauth.png - links: - - url: https://www.linkedin.com/in/brittanykrauth - icon: fa-linkedin +simon_podhajsky: + description: | + Simon Podhajsky is a lapsed neuroscientist turned data everything at iLife Technologies, a startup that seeks to digitize the life insurance agency. 
+ image_url: /img/blog/authors/simon-podhajsky.jpeg + job_title: Data Lead + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/simonpodhajsky/ + - icon: fa-twitter + url: https://twitter.com/sim_pod + name: Simon Podhajsky + organization: iLife Technologies -charlie_summers: - name: Charlie Summers - job_title: Staff Software Engineer - description: Charlie is the Data Engineer Tech Lead at Merit. He introduced Merit to dbt and it's been a fantastic fit for a wide variety of data pipelines. He likes thinking about the future of data - integrating event streams, analyzing encrypted data, capturing fine-grained lineage, and making it easy to build simple apps on top of data warehouses/lakes. - organization: Merit - image_url: /img/blog/authors/charlie-summers.jpeg - links: - - url: https://www.linkedin.com/in/charliesummers - icon: fa-linkedin +sterling_paramore: + description: | + Sterling Paramore started his career in theoretical and computation biophysics and learned that working with data was way more fun than being locked in the ivory tower. He loves solving data engineering and data analytics problems and has been a long time evangelist for dbt. 
+ image_url: /img/blog/authors/sterling-paramore.png + job_title: Sr Staff Data Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/sterlingparamore/ + name: Sterling Paramore + organization: Mainspring Energy + +sung_chung: + image_url: /img/blog/authors/sung.jpeg + job_title: Solutions Architect + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/sungwonchung1/ + name: Sung Won Chung + organization: dbt Labs + +viraj_parekh: + image_url: /img/blog/authors/viraj-parekh.jpeg + job_title: Field CTO + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/viraj-parekh-46114689/ + name: Viraj Parekh + organization: Astronomer wasila_quader: - name: Wasila Quader - job_title: Associate Analytics Engineer - description: After a winding road through healthcare spreadsheets and data science projects, Wasila discovered analytics engineering as an apprentice of dbt Labs' Foundry Program. She now works as an analytics engineer on dbt Labs' professional services team. - organization: dbt Labs - image_url: /img/blog/authors/wasila-quader.png + description: After a winding road through healthcare spreadsheets and data science projects, Wasila discovered analytics engineering as an apprentice of dbt Labs' Foundry Program. She now works as an analytics engineer on dbt Labs' professional services team. 
+ image_url: /img/blog/authors/wasila-quader.png + job_title: Associate Analytics Engineer + name: Wasila Quader + organization: dbt Labs + +yannick_misteli: + image_url: /img/blog/authors/yannick_misteli.jpg + job_title: Head of Engineering, Global Product Strategy + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/yannick-misteli-b86129105/ + name: Yannick Misteli + organization: Roche + +yu_ishikawa: + image_url: /img/blog/authors/yu-ishikawa.jpg + job_title: Senior Data Privacy Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/yuishikawa0301 + name: Yu Ishikawa + organization: Ubie + +jonathan_neo: + image_url: /img/blog/authors/jonathan-neo.png + job_title: Data Engineer + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/jonneo/ + name: Jonathan Neo + organization: Canva & Data Engineer Camp + description: Jonathan is a Data Engineer at Canva where he is building and maintaining petabyte-scale data platforms. Jonathan founded Data Engineer Camp, a bootcamp that empowers professionals to become proficient data engineers. He has since trained data and software professionals from around the world who are working at companies like Microsoft, Atlassian, and Apple. + +santiago_jauregui: + image_url: /img/blog/authors/santiago-jauregui.jpeg + job_title: Data Analytics Leader + links: + - icon: fa-linkedin + url: https://www.linkedin.com/in/santiago-jauregui/ + name: Santiago Jauregui + organization: MODO diff --git a/website/blog/categories.yml b/website/blog/categories.yml index 2a45e6529e2..8103f58cc33 100644 --- a/website/blog/categories.yml +++ b/website/blog/categories.yml @@ -15,10 +15,6 @@ display_title: dbt tutorials description: Best practices in the usage of our favorite data transformation tool. is_featured: true -- name: dbt updates - display_title: dbt product updates - description: An archive of monthly product updates from the dbt Labs team.
- is_featured: true - name: SQL magic display_title: SQL magic description: Stories of dbt developers making SQL sing across warehouses. diff --git a/website/blog/ctas.yml b/website/blog/ctas.yml index 2267b05a42a..2e3170faae4 100644 --- a/website/blog/ctas.yml +++ b/website/blog/ctas.yml @@ -9,4 +9,9 @@ header: "Just Getting Started?" subheader: Check out guides on getting your warehouse set up and connected to dbt Cloud. button_text: Learn more - url: https://docs.getdbt.com/docs/get-started/getting-started/overview + url: https://docs.getdbt.com/quickstarts +- name: coalesce_2023_signup + header: Join data practitioners worldwide at Coalesce 2023 + subheader: Kicking off on October 16th, both online and in-person (Sydney, London, and San Diego) + button_text: Register now + url: https://coalesce.getdbt.com/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_coalesce-2023_aw&utm_content=coalesce____&utm_term=all___ \ No newline at end of file diff --git a/website/blog/maching-learning-dbt-baton-pass.md b/website/blog/maching-learning-dbt-baton-pass.md index 7046ecb296a..2c38cfd8983 100644 --- a/website/blog/maching-learning-dbt-baton-pass.md +++ b/website/blog/maching-learning-dbt-baton-pass.md @@ -48,7 +48,7 @@ The ML engineer stepped in from here. She was used to doing her statistics and p ![image of table with macro names and connector compatibility](/img/blog/2022-02-18-machine-learning-dbt-baton-pass/macro-names.png) -The ML engineer got the preprocessing steps (think: one-hot encoding, feature scaling, imputation) finalized. She used SQL to read the dbt models (tables) into a Jupyter notebook to perform model training. After iterating on the machine learning models and tracking model fit (think: AUC/Precision/Recall (for classification)), she ran the model over dbt-created tables and output the predicted results as a table in the database. 
To keep documentation clean, she [configured a source](https://docs.getdbt.com/docs/building-a-dbt-project/using-sources/) within the dbt project to reflect this predicted results table. It wasn’t intuitive, but it was better than leaving it out of dbt docs. +The ML engineer got the preprocessing steps (think: one-hot encoding, feature scaling, imputation) finalized. She used SQL to read the dbt models (tables) into a Jupyter notebook to perform model training. After iterating on the machine learning models and tracking model fit (think: AUC/Precision/Recall (for classification)), she ran the model over dbt-created tables and output the predicted results as a table in the database. To keep documentation clean, she [configured a source](/docs/build/sources) within the dbt project to reflect this predicted results table. It wasn’t intuitive, but it was better than leaving it out of dbt docs. Finally, she created a dashboard on top of this table to publicize model accuracy over time to end users. To schedule this, we went to the data engineer to string together the above in [Airflow](https://discourse.getdbt.com/t/orchestrating-fivetran-and-dbt-with-airflow/2079) everyday at 8am and called it done. @@ -97,7 +97,7 @@ Gluing together notebooks and dbt isn’t the most elegant experience today. It #### How would this change my story? My ML engineer would know the quality of input data created by dbt before starting machine learning development. I could schedule this notebook in sync with my dbt jobs and know instantly if my **ML model drift is caused by data quality vs. model logic.** -Also, I would create a data app (in Hex) where users plug in different input scenarios that feed into the predictive model. 
Even better, I could track versions of my ML models deployed over time in Modelbit + Hex and deploy ML external functions as [dbt macros](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros/#macros) +Also, I would create a data app (in Hex) where users plug in different input scenarios that feed into the predictive model. Even better, I could track versions of my ML models deployed over time in Modelbit + Hex and deploy ML external functions as [dbt macros](/docs/build/jinja-macros) (by the way: how is this not more normal?!). ![Image showing the notebook and dbt synchronization](/img/blog/2022-02-18-machine-learning-dbt-baton-pass/notebook-dbt-sync.png) diff --git a/website/blog/metadata.yml b/website/blog/metadata.yml index 549e5dd04d2..a5afa86e667 100644 --- a/website/blog/metadata.yml +++ b/website/blog/metadata.yml @@ -2,16 +2,7 @@ featured_image: "" # This CTA lives in right sidebar on blog index -featured_cta: "staging" - -# How many featured posts to show on blog index -featured_posts_count: 4 - -# How many featured posts by tag to show on blog index -featured_tag_posts_count: 3 - -# How many posts which aren't in featured post section should show -regular_posts_count: 15 +featured_cta: "coalesce_2023_signup" # Show or hide hero title, description, cta from blog index show_title: true @@ -19,6 +10,3 @@ show_description: true hero_button_url: "/blog/welcome" hero_button_text: "Start here" hero_button_new_tab: false - -# Show or hide sidebar on blog post page -show_left_sidebar: false diff --git a/website/blog/2021-11-23-on-the-importance-of-naming.md b/website/blog/src.md similarity index 100% rename from website/blog/2021-11-23-on-the-importance-of-naming.md rename to website/blog/src.md diff --git a/website/cypress.config.js b/website/cypress.config.js index 0ce0dfeda3c..bd3f975cbb4 100644 --- a/website/cypress.config.js +++ b/website/cypress.config.js @@ -1,12 +1,18 @@ const { defineConfig } = require("cypress"); - module.exports = 
defineConfig({ e2e: { setupNodeEvents(on, config) { // implement node event listeners here }, - baseUrl: 'https://docs.getdbt.com', + baseUrl: "https://docs.getdbt.com", chromeWebSecurity: false, - video: false + video: false, + }, + + component: { + devServer: { + framework: "react", + bundler: "webpack", + }, }, }); diff --git a/website/cypress/e2e/docs.cy.js b/website/cypress/e2e/docs.cy.js deleted file mode 100644 index 50f0ba09dae..00000000000 --- a/website/cypress/e2e/docs.cy.js +++ /dev/null @@ -1,543 +0,0 @@ -describe('docs.getdbt.com docs tab', () => { - before(function () { - Cypress.on('uncaught:exception', (err, runnable) => { - // returning false here prevents Cypress from - // failing the test - return false; - }); - }); - - beforeEach(() => { - cy.visit('/docs/introduction'); - }); - - it('verifies all the introduction page links work and go to the correct pages', () => { - cy.get(':nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/supported-data-platforms`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(1) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/core-versions`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the building a dbt project page links work and go to the correct pages', () => { - cy.get('.theme-doc-sidebar-menu > :nth-child(2) > :nth-child(1) > .menu__link').click() - cy.get(':nth-child(2) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/projects`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(2) > .menu__list > .theme-doc-sidebar-item-category > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > 
.theme-doc-sidebar-item-category > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/building-models`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > .theme-doc-sidebar-item-category > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/building-models/materializations`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > .theme-doc-sidebar-item-category > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/building-models/configuring-incremental-models`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > .theme-doc-sidebar-item-category > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/building-models/using-custom-aliases`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > .theme-doc-sidebar-item-category > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/building-models/using-custom-schemas`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > .theme-doc-sidebar-item-category > 
.menu__list > :nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/building-models/using-custom-databases`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > .theme-doc-sidebar-item-category > .menu__list > :nth-child(7) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/building-models/using-variables`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/tests`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/documentation`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/using-sources`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/seeds`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 
350ms ease-in-out 0s;"] > :nth-child(7) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/snapshots`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(8) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/exposures`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(9) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/jinja-macros`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(10) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/hooks-operations`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(11) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/package-management`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(12) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/analyses`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 350ms ease-in-out 0s;"] > :nth-child(13) > .menu__link').click() - 
cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/building-a-dbt-project/metrics`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the running a dbt project page links work and go to the correct pages', () => { - cy.get(':nth-child(3) > .menu__list-item-collapsible > .menu__link').click() - cy.get(':nth-child(3) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/running-a-dbt-project/using-the-dbt-ide`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(3) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/running-a-dbt-project/using-the-cli`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(3) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/running-a-dbt-project/dbt-api`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(3) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/running-a-dbt-project/running-dbt-in-production`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the contributing page links work and go to the correct pages', () => { - cy.get(':nth-child(4) > .menu__list-item-collapsible > .menu__link').click() - cy.get(':nth-child(4) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/contributing/oss-expectations`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(4) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/contributing/contributor-license-agreements`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(4) > .menu__list > :nth-child(3) > .menu__link').click() - 
cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/contributing/building-a-new-adapter`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(4) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/contributing/testing-a-new-adapter`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(4) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/contributing/documenting-a-new-adapter`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(4) > .menu__list > :nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/contributing/slack-rules-of-the-road`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the about page links work and go to the correct pages', () => { - cy.get(':nth-child(5) > .menu__list-item-collapsible > .menu__link').click() - cy.get(':nth-child(5) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/about/license`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get(':nth-child(5) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/about/viewpoint`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - accounts page links work and go to the correct pages', () => { - // frequently asked questions collapsible section takes the user - // to an index page - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(1) > 
.menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(1) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Accounts/change-billing`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(1) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Accounts/configurable-snapshot-path`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(1) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Accounts/dbt-specific-jinja`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(1) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Accounts/git-account-in-use`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(1) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Accounts/payment-accepted`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(1) > .menu__list > :nth-child(6) > .menu__link').click() - 
cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Accounts/slack`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - core page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(2) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(2) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Core/install-pip-best-practices.md`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(2) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Core/install-pip-os-prereqs.md`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(2) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Core/install-python-compatibility`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - docs page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) 
- - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(3) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(3) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Docs/document-all-columns`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(3) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Docs/document-other-resources`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(3) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Docs/documenting-macros`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(3) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Docs/long-descriptions`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(3) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Docs/sharing-documentation`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - environments page links work and 
go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(4) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(4) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Environments/beta-release`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(4) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Environments/diff-database-environment`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(4) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Environments/profile-env-vars`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(4) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Environments/profile-name`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(4) > .menu__list > :nth-child(5) > 
.menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Environments/target-names`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - git page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(5) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(5) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Git/gitignore`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(5) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Git/gitlab-authentication`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(5) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Git/gitlab-selfhosted`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(5) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Git/google-cloud-repo`, 
`${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(5) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Git/managed-repo`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(5) > .menu__list > :nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Git/run-on-pull`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - jinja page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(6) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Jinja/jinja-whitespace`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(6) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Jinja/quoting-column-names`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; 
transition: height 357ms ease-in-out 0s;"] > :nth-child(6) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Jinja/which-jinja-docs`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - models page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/available-configurations`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/available-materializations`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/configurable-model-path`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(4) > 
.menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/create-a-schema`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/create-dependencies`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/insert-records`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(7) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/model-custom-schemas`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(8) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/reference-models-in-another-project`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(9) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/removing-deleted-models`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms 
ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(10) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/run-downtime`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(11) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/source-quotes`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(12) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/specifying-column-types`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(13) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/sql-dialect`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(7) > .menu__list > :nth-child(14) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Models/unique-model-names`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - project page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; 
transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/dbt-source-freshness`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/debugging-jinja`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/define-a-column-type`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/docs-for-multiple-projects`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/example-projects`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > 
:nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/exclude-table-from-freshness`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(7) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/multiple-resource-yml-files`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(8) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/project-name`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(9) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/properties-not-in-config`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(10) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/resource-yml-name`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(11) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/schema-yml-name`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; 
transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(12) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/separate-profile`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(13) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/source-has-bad-name`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(14) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/source-in-different-database`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(15) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/structure-a-project`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(16) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/which-materialization`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(17) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/which-schema`, `${Cypress.config('baseUrl')}/docs/introduction`) - - 
cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(18) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/why-not-write-dml`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(19) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/why-so-many-macros`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(20) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/why-version-2`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(8) > .menu__list > :nth-child(21) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Project/yaml-file-extension`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - runs page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(9) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms 
ease-in-out 0s;"] > :nth-child(9) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Runs/checking-logs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(9) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Runs/failed-prod-run`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(9) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Runs/failed-tests`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(9) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Runs/run-downstream-of-seed`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(9) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Runs/run-one-model`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(9) > .menu__list > :nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Runs/run-one-snapshot`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; 
transition: height 357ms ease-in-out 0s;"] > :nth-child(9) > .menu__list > :nth-child(7) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Runs/running-model-downstream-of-source`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(9) > .menu__list > :nth-child(8) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Runs/snapshot-frequency`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - seeds page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(10) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(10) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Seeds/build-one-seed`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(10) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Seeds/full-refresh-seed`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(10) > .menu__list > :nth-child(3) > 
.menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Seeds/leading-zeros-in-seed`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(10) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Seeds/load-raw-data-with-seed`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(10) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Seeds/seed-custom-schemas`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(10) > .menu__list > :nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Seeds/seed-datatypes`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(10) > .menu__list > :nth-child(7) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Seeds/seed-hooks`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - snapshots page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(11) > 
.menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(11) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Snapshots/snapshot-hooks`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(11) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Snapshots/snapshot-schema-changes`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(11) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Snapshots/snapshot-target-schema`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(11) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Snapshots/snapshotting-freshness-for-one-source`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - tests page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list-item-collapsible > .menu__link').click() - 
cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/available-tests`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/configurable-data-path`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/configurable-data-test-path`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/custom-test-thresholds`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/recommended-tests`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(6) > .menu__link').click() - 
cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/test-one-model`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(7) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/testing-seeds`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(8) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/testing-sources`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(9) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/uniqueness-two-columns`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(12) > .menu__list > :nth-child(10) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Tests/when-to-test`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - troubleshooting page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list-item-collapsible > 
.menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/access-gdrive-credential`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/access_token_error`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/dispatch-could-not-find-package`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/git-revlist-error`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/gitignore`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(6) > .menu__link').click() - 
cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/gitlab-authentication`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(7) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/nonetype-ide-error`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(8) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/partial-parsing-error`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(9) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/runtime-error-could-not-find-profile`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(10) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/runtime-packages.yml`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(11) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/sql-errors`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; 
will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(13) > .menu__list > :nth-child(12) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Troubleshooting/unused-model-configurations`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) - - it('verifies all the frequently asked questions - warehouse page links work and go to the correct pages', () => { - cy.get(':nth-child(6) > .menu__list-item-collapsible > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(14) > .menu__list-item-collapsible > .menu__link').click() - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(14) > .menu__list > :nth-child(1) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Warehouse/bq-impersonate-service-account-setup`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(14) > .menu__list > :nth-child(2) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Warehouse/bq-impersonate-service-account-why`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(14) > .menu__list > :nth-child(3) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Warehouse/connecting-to-two-dbs-not-allowed`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: 
height 357ms ease-in-out 0s;"] > :nth-child(14) > .menu__list > :nth-child(4) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Warehouse/database-privileges`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(14) > .menu__list > :nth-child(5) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Warehouse/loading-data`, `${Cypress.config('baseUrl')}/docs/introduction`) - - cy.get('[style="display: block; overflow: visible; height: auto; will-change: height; transition: height 357ms ease-in-out 0s;"] > :nth-child(14) > .menu__list > :nth-child(6) > .menu__link').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/Warehouse/sample-profiles`, `${Cypress.config('baseUrl')}/docs/introduction`) - }) -}) \ No newline at end of file diff --git a/website/cypress/e2e/homepage.cy.js b/website/cypress/e2e/homepage.cy.js deleted file mode 100644 index 6ffee3454f0..00000000000 --- a/website/cypress/e2e/homepage.cy.js +++ /dev/null @@ -1,61 +0,0 @@ -const sizes = ['iphone-x', [768, 1024], [1280, 720]] - -describe('docs.getdbt.com homepage', () => { - before(function () { - Cypress.on('uncaught:exception', (err, runnable) => { - // returning false here prevents Cypress from - // failing the test - return false; - }); - }); - - beforeEach(() => { - cy.visit('/'); - }); - - sizes.forEach((size) => { - it('verifies all the button links work and go to the correct page', () => { - if (Cypress._.isArray(size)) { - cy.viewport(size[0], size[1]) - } else { - cy.viewport(size) - } - - cy.get('[style="max-width:var(--ifm-container-width);margin:calc(2vh) auto calc(2vh)"] > :nth-child(1) > .card > .card__footer > .button').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/introduction`, `${Cypress.config('baseUrl')}/`) - - 
cy.get('[style="max-width:var(--ifm-container-width);margin:calc(2vh) auto calc(2vh)"] > :nth-child(2) > .card > .card__footer > .button').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/guides/getting-started`, `${Cypress.config('baseUrl')}/`) - - cy.get('[style="max-width:var(--ifm-container-width);margin:calc(2vh) auto calc(2vh)"] > :nth-child(3) > .card > .card__footer > .button').invoke('removeAttr', 'target').click() - cy.checkLinksNotBroken('https://www.getdbt.com/dbt-learn/', `${Cypress.config('baseUrl')}/`) - - cy.get(':nth-child(2) > :nth-child(2) > :nth-child(1) > .card > .card__footer > .button').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/introduction`, `${Cypress.config('baseUrl')}/`) - - cy.get(':nth-child(2) > :nth-child(2) > :nth-child(2) > .card > .card__footer > .button').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/reference/dbt_project.yml`, `${Cypress.config('baseUrl')}/`) - - cy.get(':nth-child(2) > :nth-child(2) > :nth-child(3) > .card > .card__footer > .button').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/faqs`, `${Cypress.config('baseUrl')}/`) - - cy.get(':nth-child(4) > :nth-child(1) > .card > .card__footer > .button').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/dbt-cloud/cloud-overview`, `${Cypress.config('baseUrl')}/`) - - cy.get(':nth-child(4) > :nth-child(2) > .card > .card__footer > .button').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/dbt-cloud/dbt-cloud-api/cloud-apis`, `${Cypress.config('baseUrl')}/`) - - cy.get(':nth-child(2) > :nth-child(1) > .card > .card__footer > .button').contains('Get Advice').invoke('removeAttr', 'target').click() - cy.checkLinksNotBroken('https://discourse.getdbt.com/', `${Cypress.config('baseUrl')}/`) - - cy.get(':nth-child(2) > :nth-child(2) > .card > .card__footer > .button').contains('Join us on Slack').then(($button => { - cy.wrap($button).should('have.attr', 
'href').and('eq', 'http://community.getdbt.com/') - cy.wrap($button).invoke('removeAttr', 'target').click() - })) - cy.checkLinksNotBroken('https://www.getdbt.com/community/', `${Cypress.config('baseUrl')}/`) - - cy.get(':nth-child(2) > :nth-child(3) > .card > .card__footer > .button').contains('View Projects').click() - cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/faqs/project/example-projects`, `${Cypress.config('baseUrl')}/`) - }) - }) -}) \ No newline at end of file diff --git a/website/cypress/e2e/sideMenuPageLinks.cy.js b/website/cypress/e2e/sideMenuPageLinks.cy.js new file mode 100644 index 00000000000..4b280a82a29 --- /dev/null +++ b/website/cypress/e2e/sideMenuPageLinks.cy.js @@ -0,0 +1,225 @@ +// This test takes a long time to run. +// Note: it should not be part of CI, but it can be run locally. + +describe('docs.getdbt.com docs tab', () => { + before(function () { + Cypress.on('uncaught:exception', (err, runnable) => { + // returning false here prevents Cypress from + // failing the test + return false; + }); + }); + + beforeEach(() => { + cy.visit('/docs/introduction'); + }); + it('verifies sidebar menu has the correct level one items', () => { + // > li yields list items + const menuItems = [ + 'Supported data platforms', + 'dbt support', + 'Get started with dbt', + 'Build dbt projects', + 'Deploy dbt projects', + 'Collaborate with others', + 'Use the dbt Semantic Layer', + 'Available dbt versions', + 'dbt support', + 'Frequently asked questions', + ] + cy.get('.theme-doc-sidebar-menu > li').as('listItems') + + menuItems.forEach(item => { + cy.get('@listItems').contains(item).should('exist') + }) + }) + + it('verifies level one menu page links work and go to the correct pages', () => { + // > li yields list items + cy.get('.theme-doc-sidebar-menu > li').as('listItems') + + cy.get('@listItems').contains('Supported data platforms').click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/supported-data-platforms`,
`${Cypress.config('baseUrl')}/docs/introduction`) + + cy.get('@listItems').contains('dbt support').click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/dbt-support`, `${Cypress.config('baseUrl')}/docs/introduction`) + }) + + it('verifies /get started with dbt/ sub menu page links work and go to the correct pages', () => { + // > li yields list items + const menuItems = [ + 'About getting started', + 'Get started with dbt Cloud', + 'Get started with dbt Core', + 'Run your dbt projects', + ] + const subMenuA = [ + 'About set up', + 'Getting set up', + 'Building your first project', + 'PrivateLink', + 'Learning more', + 'dbt Cloud features', + 'Develop in Cloud', + 'dbt Cloud tips', + ] + let items = [] + + cy.get('.theme-doc-sidebar-menu > li').as('listItems') + cy.get('@listItems').contains('Get started with dbt').as('listItem') + cy.get('@listItem').click() + cy.get('@listItem').parent().parent().as('subMenu') + + menuItems.forEach(item => { + cy.get('@subMenu').contains(item).should('be.visible') + }) + cy.get('@subMenu').contains('About getting started').click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/get-started/getting-started/overview`, `${Cypress.config('baseUrl')}/docs/introduction`) + + cy.get('@subMenu').contains('Run your dbt projects').click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/get-started/run-your-dbt-projects`, `${Cypress.config('baseUrl')}/docs/introduction`) + + // test all /Get started with dbt Cloud/ items are valid + cy.get('@subMenu').contains('Get started with dbt Cloud').click() + + cy.get('@subMenu').contains('About set up').as('subMenuItem') + cy.get('@subMenuItem').click() + cy.get('@subMenuItem').parent().parent().as('subMenuA') + subMenuA.forEach(item => { + cy.get('@subMenuA').contains(item).should('be.visible') + }) + + // verify /get set up/ page links work + cy.get('@subMenuA').contains('About set up').click() + 
cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/get-started/getting-started/set-up-dbt-cloud`, `${Cypress.config('baseUrl')}/docs/introduction`) + cy.get('@subMenuA').contains('dbt Cloud feature').click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/get-started/dbt-cloud-features`, `${Cypress.config('baseUrl')}/docs/introduction`) + cy.get('@subMenuA').contains('Connect your database').click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/get-started/connect-your-database`, `${Cypress.config('baseUrl')}/docs/introduction`) + cy.get('@subMenuA').contains('Develop in Cloud').click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/get-started/develop-in-the-cloud`, `${Cypress.config('baseUrl')}/docs/introduction`) + cy.get('@subMenuA').contains('dbt Cloud tips').click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}/docs/get-started/dbt-cloud-tips`, `${Cypress.config('baseUrl')}/docs/introduction`) + + // verifies /Getting set up/ page links work + items = [ + { + title: 'Set up and connect BigQuery', + slug: '/docs/get-started/getting-started/getting-set-up/setting-up-bigquery' + }, + { + title: 'Set up and connect Databricks', + slug: '/docs/get-started/getting-started/getting-set-up/setting-up-databricks' + }, + { + title: 'Set up and connect Redshift', + slug: '/docs/get-started/getting-started/getting-set-up/setting-up-redshift' + }, + { + title: 'Set up and connect Snowflake', + slug: '/docs/get-started/getting-started/getting-set-up/setting-up-snowflake' + }, + + ] + // navigate to subMenu + cy.get('@subMenuA').contains('Getting set up').as('subMenuItemAA') + cy.get('@subMenuItemAA').click() + cy.get('@subMenuItemAA').parent().parent().as('subMenuAA') + // check visibility and links + items.forEach(item => { + cy.get('@subMenuAA').contains(item.title).should('be.visible') + cy.get('@subMenuAA').contains(item.title).click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}${item.slug}`, 
`${Cypress.config('baseUrl')}/docs/introduction`) + + }) + cy.get('@subMenuItemAA').click() + + // verifies /Building your first project/ page links work + items = [ + { + title: 'Build your first models', + slug: '/docs/get-started/getting-started/building-your-first-project/build-your-first-models' + }, + { + title: 'Test and document your project', + slug: '/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project' + }, + { + title: 'Schedule a job', + slug: '/docs/get-started/getting-started/building-your-first-project/schedule-a-job' + }, + ] + // navigate to subMenu + cy.get('@subMenuA').contains('Building your first project').as('subMenuItemAB') + cy.get('@subMenuItemAB').click() + cy.get('@subMenuItemAB').parent().parent().as('subMenuAB') + // check visibility and links + items.forEach(item => { + cy.get('@subMenuAB').contains(item.title).should('be.visible') + cy.get('@subMenuAB').contains(item.title).click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}${item.slug}`, `${Cypress.config('baseUrl')}/docs/introduction`) + + }) + cy.get('@subMenuItemAB').click() + + //verifies /PrivateLink/ page links work + + items = [ + { + title: 'About PrivateLink', + slug: '/docs/get-started/privatelink/about-privatelink' + }, + { + title: 'PrivateLink for Snowflake', + slug: '/docs/get-started/privatelink/snowflake-privatelink' + }, + { + title: 'PrivateLink for Redshift', + slug: '/docs/get-started/privatelink/redshift-privatelink' + }, + { + title: 'PrivateLink for Databricks', + slug: '/docs/get-started/privatelink/databricks-privatelink' + }, + ] + // navigate to subMenu + cy.get('@subMenuA').contains('PrivateLink').as('subMenuItemAC') + cy.get('@subMenuItemAC').click() + cy.get('@subMenuItemAC').parent().parent().as('subMenuAC') + // check visibility and links + items.forEach(item => { + cy.get('@subMenuAC').contains(item.title).should('be.visible') + cy.get('@subMenuAC').contains(item.title).click() + 
cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}${item.slug}`, `${Cypress.config('baseUrl')}/docs/introduction`) + + }) + cy.get('@subMenuItemAC').click() + + // verifies /learning more/ page links work + items = [ + { + title: 'Using Jinja', + slug: '/guides/advanced/using-jinja' + }, + { + title: 'Refactoring legacy SQL to dbt', + slug: '/guides/migration/tools/refactoring-legacy-sql' + }, + ] + // navigate to subMenu + cy.get('@subMenuA').contains('Learning more').as('subMenuItemAD') + cy.get('@subMenuItemAD').click() + cy.get('@subMenuItemAD').parent().parent().as('subMenuAD') + // check visibility and links + items.forEach(item => { + cy.get('@subMenuAD').contains(item.title).should('be.visible') + cy.get('@subMenuAD').contains(item.title).click() + cy.checkLinksNotBroken(`${Cypress.config('baseUrl')}${item.slug}`, `${Cypress.config('baseUrl')}/docs/introduction`) + + }) + cy.get('@subMenuItemAD').click() + + }) + + +}) diff --git a/website/cypress/fixtures/example.json b/website/cypress/fixtures/example.json new file mode 100644 index 00000000000..02e4254378e --- /dev/null +++ b/website/cypress/fixtures/example.json @@ -0,0 +1,5 @@ +{ + "name": "Using fixtures to represent data", + "email": "hello@cypress.io", + "body": "Fixtures are a great way to mock data for responses to routes" +} diff --git a/website/cypress/support/component-index.html b/website/cypress/support/component-index.html new file mode 100644 index 00000000000..ac6e79fd83d --- /dev/null +++ b/website/cypress/support/component-index.html @@ -0,0 +1,12 @@ + + + + + + + Components App + + +
+ + \ No newline at end of file diff --git a/website/cypress/support/component.js b/website/cypress/support/component.js new file mode 100644 index 00000000000..0d9eef9057d --- /dev/null +++ b/website/cypress/support/component.js @@ -0,0 +1,27 @@ +// *********************************************************** +// This example support/component.js is processed and +// loaded automatically before your test files. +// +// This is a great place to put global configuration and +// behavior that modifies Cypress. +// +// You can change the location of this file or turn off +// automatically serving support files with the +// 'supportFile' configuration option. +// +// You can read more here: +// https://on.cypress.io/configuration +// *********************************************************** + +// Import commands.js using ES2015 syntax: +import './commands' + +// Alternatively you can use CommonJS syntax: +// require('./commands') + +import { mount } from 'cypress/react' + +Cypress.Commands.add('mount', mount) + +// Example use: +// cy.mount() \ No newline at end of file diff --git a/website/dbt-versions.js b/website/dbt-versions.js index 03f2721e42d..910c4a6ef4d 100644 --- a/website/dbt-versions.js +++ b/website/dbt-versions.js @@ -1,4 +1,21 @@ exports.versions = [ + { + version: "1.7", + EOLDate: "2024-07-31", + isPrerelease: "true" + }, + { + version: "1.6", + EOLDate: "2024-07-31", + }, + { + version: "1.5", + EOLDate: "2024-04-27", + }, + { + version: "1.4", + EOLDate: "2024-01-25", + }, { version: "1.3", EOLDate: "2023-10-12", @@ -7,25 +24,141 @@ exports.versions = [ version: "1.2", EOLDate: "2023-07-26", }, +] + +exports.versionedPages = [ + { + "page": "reference/resource-configs/store_failures_as", + "firstVersion": "1.7", + }, { - version: "1.1", - EOLDate: "2023-04-28", + "page": "docs/build/build-metrics-intro", + "firstVersion": "1.6", }, { - version: "1.0", - EOLDate: "2022-12-03" + "page": "docs/build/sl-getting-started", + "firstVersion": "1.6", }, { - 
version: "0.21", - EOLDate: "2022-06-30" + "page": "docs/build/about-metricflow", + "firstVersion": "1.6", }, { - version: "0.20", - EOLDate: "2022-06-30" - } -] - -exports.versionedPages = [ + "page": "docs/build/join-logic", + "firstVersion": "1.6", + }, + { + "page": "docs/build/validation", + "firstVersion": "1.6", + }, + { + "page": "docs/build/semantic-models", + "firstVersion": "1.6", + }, + { + "page": "docs/build/group-by", + "firstVersion": "1.6", + }, + { + "page": "docs/build/entities", + "firstVersion": "1.6", + }, + { + "page": "docs/build/metrics-overview", + "firstVersion": "1.6", + }, + { + "page": "docs/build/cumulative", + "firstVersion": "1.6", + }, + { + "page": "docs/build/derived", + "firstVersion": "1.6", + }, + { + "page": "docs/build/measure-proxy", + "firstVersion": "1.6", + }, + { + "page": "docs/build/ratio", + "firstVersion": "1.6", + }, + { + "page": "reference/commands/clone", + "firstVersion": "1.6", + }, + { + "page": "docs/collaborate/govern/project-dependencies", + "firstVersion": "1.6", + }, + { + "page": "reference/dbt-jinja-functions/thread_id", + "firstVersion": "1.6", + }, + { + "page": "reference/resource-properties/deprecation_date", + "firstVersion": "1.6", + }, + { + "page": "reference/commands/retry", + "firstVersion": "1.6", + }, + { + "page": "docs/build/groups", + "firstVersion": "1.5", + }, + { + "page": "docs/collaborate/govern/model-contracts", + "firstVersion": "1.5", + }, + { + "page": "reference/commands/show", + "firstVersion": "1.5", + }, + { + "page": "docs/collaborate/govern/model-access", + "firstVersion": "1.5", + }, + { + "page": "docs/collaborate/govern/model-versions", + "firstVersion": "1.5", + }, + { + "page": "reference/programmatic-invocations", + "firstVersion": "1.5", + }, + { + "page": "reference/resource-configs/contract", + "firstVersion": "1.5", + }, + { + "page": "reference/resource-configs/group", + "firstVersion": "1.5", + }, + { + "page": "reference/resource-properties/access", + 
"firstVersion": "1.5", + }, + { + "page": "reference/resource-properties/constraints", + "firstVersion": "1.5", + }, + { + "page": "reference/resource-properties/latest_version", + "firstVersion": "1.5", + }, + { + "page": "reference/resource-properties/versions", + "firstVersion": "1.5", + }, + { + "page": "reference/dbt-jinja-functions/local-md5", + "firstVersion": "1.4", + }, + { + "page": "reference/warehouse-setups/fal-setup", + "firstVersion": "1.3", + }, { "page": "reference/dbt-jinja-functions/set", "firstVersion": "1.2", @@ -38,20 +171,23 @@ exports.versionedPages = [ "page": "reference/dbt-jinja-functions/cross-database-macros", "firstVersion": "1.2", }, - { + { "page": "reference/resource-configs/grants", "firstVersion": "1.2", }, { - "page": "docs/contributing/testing-a-new-adapter", - "firstVersion": "1.1", - }, + "page": "docs/build/saved-queries", + "firstVersion": "1.7", + } +] + +exports.versionedCategories = [ { - "page": "reference/dbt-jinja-functions/selected_resources", - "firstVersion": "1.1", + "category": "Model governance", + "firstVersion": "1.5", }, { - "page": "reference/dbt-jinja-functions/print", - "firstVersion": "1.1", + "category": "Build your metrics", + "firstVersion": "1.6", } ] diff --git a/website/docs/community/contributing/contributing-coding.md b/website/docs/community/contributing/contributing-coding.md index 9157eb6b2ec..4c587fbd857 100644 --- a/website/docs/community/contributing/contributing-coding.md +++ b/website/docs/community/contributing/contributing-coding.md @@ -7,7 +7,7 @@ id: "contributing-coding" #### Overview -[dbt Packages](https://docs.getdbt.com/docs/building-a-dbt-project/package-management) are the easiest way for analytics engineers to get involved with contributing code to the dbt Community, because dbt Packages are just standard [dbt Projects](https://docs.getdbt.com/docs/building-a-dbt-project/projects). If you can create a dbt Project, write a macro, and ref a model: you can make a dbt Package. 
Packages function much like libraries do in other programming languages. They allow for prewritten, modularized development of code to solve common problems in analytics engineering. You can view all dbt Packages on the [dbt Package Hub](https://hub.getdbt.com/). +[dbt Packages](https://docs.getdbt.com/docs/build/packages) are the easiest way for analytics engineers to get involved with contributing code to the dbt Community, because dbt Packages are just standard [dbt Projects](/docs/build/projects). If you can create a dbt Project, write a macro, and ref a model: you can make a dbt Package. Packages function much like libraries do in other programming languages. They allow for prewritten, modularized development of code to solve common problems in analytics engineering. You can view all dbt Packages on the [dbt Package Hub](https://hub.getdbt.com/). #### Contribution opportunities @@ -48,7 +48,7 @@ There are three primary ways to contribute to the dbt OSS projects. We’ll use #### Get started -- Read the dbt Core [contribution guide](https://github.com/dbt-labs/dbt-core/blob/main/CONTRIBUTING.md) and the [Open Source Software Expectations](https://docs.getdbt.com/docs/contributing/oss-expectations). +- Read the dbt Core [contribution guide](https://github.com/dbt-labs/dbt-core/blob/main/CONTRIBUTING.md) and the [Open Source Software Expectations](/community/resources/oss-expectations). - If contributing to dbt Core, find an issue labeled “[good first issue](https://github.com/dbt-labs/dbt-core/issues?q=is%3Aopen+is%3Aissue+label%3Agood_first_issue)”, or look for similar labels on other repositories. If in doubt, also feel free to ask the maintainers for a good first issue, they’ll be excited to welcome you! #### Need help? 
diff --git a/website/docs/community/contributing/contributing-realtime-events.md b/website/docs/community/contributing/contributing-realtime-events.md index 36d1c9e2339..a648fc27787 100644 --- a/website/docs/community/contributing/contributing-realtime-events.md +++ b/website/docs/community/contributing/contributing-realtime-events.md @@ -7,11 +7,15 @@ id: "contributing-realtime-events" #### Overview -Meetups are a place to engage and interact with your fellow dbt Community members (in person when possible but sometimes online). We’ve got 21 Meetups in 16 countries. +Meetups are all about knowledge sharing; they are a place to connect and learn with your fellow dbt Community members. They usually take place in person, with some happening virtually. The Meetups take place across the globe, and you can check them all out [here](https://www.meetup.com/pro/dbt/). #### Contribution opportunities -Give a talk! Meetups are all about sharing your knowledge with other analytics practitioners. Have you recently solved a problem in your data organization, published a package or generally done something of interest to the dbt Community? Meet your local pals and share what you’ve done at a meetup. +- Become a Meetup organizer +- Speak at an event +- Sponsor an event or provide your office space + +For all of these opportunities, please fill out an [interest form](https://docs.google.com/forms/d/e/1FAIpQLScdzuz9Ouo1b07BMHveEBJsJ3rJAYuFvbTKep2fXDL0iZTZUg/viewform) and we will get back to you. #### Sample contributions @@ -22,7 +26,7 @@ Give a talk! Meetups are all about sharing your knowledge with other analytics p - Read [How to Deliver a Fantastic Meetup Talk](/community/resources/speaking-at-a-meetup). - Find a [Meetup near you](https://www.meetup.com/pro/dbt/), start attending and let the organizers know you are interested! 
- + ### Speak at Coalesce #### Overview diff --git a/website/docs/community/contributing/contributing-writing.md b/website/docs/community/contributing/contributing-writing.md index 86635e651c7..20527a6ae77 100644 --- a/website/docs/community/contributing/contributing-writing.md +++ b/website/docs/community/contributing/contributing-writing.md @@ -26,7 +26,7 @@ We appreciate these contributions because they contain context in the original p #### Get started - You can contribute to [docs.getdbt.com](http://docs.getdbt.com) by looking at our repository’s [README](https://github.com/dbt-labs/docs.getdbt.com#readme) or clicking **Edit this page** at the bottom of most pages at docs.getdbt.com. -- Read the [OSS Expectations](https://docs.getdbt.com/docs/contributing/oss-expectations). +- Read the [OSS Expectations](/community/resources/oss-expectations). - Find an issue labeled “[good first issue](https://github.com/dbt-labs/docs.getdbt.com/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22+).” - Need help: Visit #community-writers on the Community Slack or mention `@dbt-labs/product-docs` in a pull request or issue comment. diff --git a/website/docs/community/resources/code-of-conduct.md b/website/docs/community/resources/code-of-conduct.md index 6788f3ae39f..22159b36cc9 100644 --- a/website/docs/community/resources/code-of-conduct.md +++ b/website/docs/community/resources/code-of-conduct.md @@ -1,22 +1,16 @@ --- title: "Code of Conduct" id: "code-of-conduct" +description: "Learn about the community values that shape our rules, and review our anti-harassment policy." --- # dbt Community Code of Conduct -dbt has a supportive, active community of thousands of smart, kind, and helpful people who share a commitment to elevating the analytics profession. +This Code of Conduct applies to all dbt Community spaces, both online and offline. This includes Slack, Discourse, code repositories (dbt Core, dbt packages, etc.), Office Hours, and Meetups. 
Participants are responsible for knowing and abiding by this Code of Conduct. -You can get involved in the dbt community by connecting at [events](/community/events), getting or giving help in any of our many channels, contributing to dbt or a dbt package, and many other ways. - -People genuinely love this community, and we are committed to maintaining the spirit of it. As such have written this Code of Conduct to help all participants understand how to best participate in our community. - -The Code of Conduct applies to all dbt Community spaces both online and off. This includes: Slack, Discourse, code repositories (dbt Core, dbt packages etc), Office Hours and Meetups. There are some guidelines specific to particular forums (listed below). Participants are responsible for knowing and abiding by this Code of Conduct. - -This Code of Conduct has three sections: +This Code of Conduct has two sections: - **dbt Community Values:** These values apply to all of our community spaces, and all of our guidelines are based on these values. -- **Forum-specific guidelines**: These guidelines explain some of the cultural norms that apply to specific forums. - **Anti-harassment policy:** We are dedicated to providing a harassment-free experience for everyone in our community — here, we outline exactly what that means. We appreciate your support in continuing to build a community we’re all proud of. @@ -24,19 +18,16 @@ We appreciate your support in continuing to build a community we’re all proud — The dbt Community Admin Team. ## dbt Community Values +### Create more value than you capture. -### Be respectful. - -We want everyone to have a fulfilling and positive experience in the dbt Community and we are continuously grateful in your help ensuring that this is the case. - -Be courteous, respectful, and polite to fellow community members. Generally, don’t be a jerk. - -Be considerate of others’ time — many people in the community generously give their time for free. 
+Each community member should strive to create more value in the community than they capture. This is foundational to being a community. Ways to demonstrate this value: -- Take the time to write bug reports well ([example](https://github.com/fishtown-analytics/dbt/issues/2370)) -- Thank people if they help solve a problem. +- [Coding contributions](/community/contributing/contributing-coding): Contribute to dbt Core, a package, or an adapter. Beyond implementing new functionality, you can also open issues or participate in discussions. +- [Writing contributions](/community/contributing/contributing-writing): You can suggest edits to every page of the dbt documentation, or suggest a topic for the dbt Developer Blog. +- [Join in online](/community/contributing/contributing-online-community): Ask and answer questions on the Discourse forum, kick off a lively discussion in Slack, or even maintain a Slack channel of your own. +- [Participate in events](/community/contributing/contributing-realtime-events): Organise a community Meetup, speak at an event, or provide office space/sponsorship for an existing event. ### Be you. @@ -44,7 +35,8 @@ Some developer communities allow and even encourage anonymity — we prefer it w Ways to demonstrate this value: -- Update your profile on any dbt Community forums to include your name, and a clear picture. On Slack, use the “what I do” section to add your role title and current company +- Update your profile on dbt Community platforms to include your name and a clear picture of yourself. Where available, use the "what I do" section to add your role, title and current company. +- Join your `#local-` channel in Slack, or if it doesn't exist then propose a new one. - Write in your own voice, and offer your own advice, rather than speaking in your company’s marketing or support voice. ### Encourage diversity and participation. 
@@ -57,73 +49,19 @@ Ways to demonstrate this value: - Demonstrate empathy for a community member’s experience — not everyone comes from the same career background, so adjust answers accordingly. - If you are sourcing speakers for events, put in additional effort to find speakers from underrepresented groups. -### Create more value than you capture. - -Each community member should strive to create more value in the community than they capture. This is foundational to being a community. - -Ways to demonstrate this value: - -- Contribute to dbt or a dbt package -- Participate in discussions on Slack and Discourse -- Share things you have learned on Discourse -- Host events - -Be mindful that others may not want their image or name on social media, and when attending or hosting an in-person event, ask permission prior to posting about another person. - ### Be curious. -Always ask yourself “why?” and strive to be continually learning. +Always ask yourself "why?" and strive to be continually learning. Ways to demonstrate this value: -- Try solving a problem yourself before asking for help, e.g. rather than asking “what happens when I do X”, experiment and observe the results! -- When asking questions, explain the “why” behind your decisions, e.g. “I’m trying to solve X problem, by writing Y code. I’m getting Z problem” -- When helping someone else, explain why you chose that solution, or if no solution exists, elaborate on the reason for that, e.g. “That’s not possible in dbt today — but here’s a workaround / check out this GitHub issue for a relevant discussion” - -## Guidelines - -### Participating in Slack - -dbt Slack is where the dbt community hangs out, discusses issues, and troubleshoots problems together. It is not a support service — please do not treat it like one. - -We also have a number of cultural norms in our Slack community. 
You must read and agree to the rules before joining Slack, but you can also find them [here](/community/resources/slack-rules-of-the-road/). - -As a short summary: - -- [Rule 1: Be respectful](/community/resources/slack-rules-of-the-road/#rule-1-be-respectful) -- [Rule 2: Use the right channel](/community/resources/slack-rules-of-the-road/#rule-2-use-the-right-channel) -- [Rule 3: Put effort into your question](/community/resources/slack-rules-of-the-road/#rule-3-put-effort-into-your-question) -- [Rule 4: Do not double-post](/community/resources/slack-rules-of-the-road/#rule-4-do-not-double-post) -- [Rule 5: Keep it in public channels](/community/resources/slack-rules-of-the-road/#rule-5-keep-it-in-public-channels) -- [Rule 6: Do not solicit members of our Slack](/community/resources/slack-rules-of-the-road/#rule-6-do-not-solicit-members-of-our-slack) -- [Rule 7: Do not demand attention with @channel and @here, or by tagging individuals](/community/resources/slack-rules-of-the-road/#rule-7-do-not-demand-attention-with-channel-and-here-or-by-tagging-individuals) -- [Rule 8: Use threads](/community/resources/slack-rules-of-the-road/#rule-8-use-threads) - -### Vendor guidelines - -If you are a vendor (i.e. you represent an organization that sells a product or service relevant to our community), then there are additional guidelines you should be aware of. - -Most importantly — do not solicit members of our community as lead generation. You can find the rest of these [here](/community/resources/vendor-guidelines). - -### Guideline violations — 3 strikes method - -The point of our guidelines is not to find opportunities to punish people, but we do need a fair way to deal with people who do harm to our community. Violations related to our anti-harassment policy (below) will be addressed immediately and are not subject to 3 strikes. - -1. First occurrence: We’ll give you a friendly, but public, reminder that the behavior is inappropriate according to our guidelines. -2. 
Second occurrence: We’ll send you a private message with a warning that any additional violations will result in removal from the community. -3. Third occurrence: Depending on the violation, we might need to delete or ban your account. - -Notes: - -- Obvious spammers are banned on first occurrence. -- Participation in the dbt Community is a privilege — we reserve the right to remove people from the community. -- Violations are forgiven after 6 months of good behavior, and we won’t hold a grudge. -- People who are committing minor formatting / style infractions will get some education, rather than hammering them in the 3 strikes process. -- Contact conduct@getdbt.com to report abuse or appeal violations. In the case of appeals, we know that mistakes happen, and we’ll work with you to come up with a fair solution if there has been a misunderstanding. +- Try solving a problem yourself before asking for help, e.g. rather than asking "what happens when I do X", experiment and observe the results! +- When asking questions, explain the "why" behind your decisions, e.g. "I’m trying to solve X problem, by writing Y code. I’m getting Z problem" +- When helping someone else, explain why you chose that solution, or if no solution exists, elaborate on the reason for that, e.g. "That’s not possible in dbt today — but here’s a workaround / check out this GitHub issue for a relevant discussion" ## Anti-harassment policy -Further to our guidelines for participating in the community in a positive manner, we are also dedicated to providing a harassment-free experience for everyone. We do not tolerate harassment of participants in any form. +We are dedicated to providing a harassment-free experience for everyone. We do not tolerate harassment of participants in any form. Harassment includes: @@ -131,7 +69,7 @@ Harassment includes: - Unwelcome comments regarding a person’s lifestyle choices and practices, including those related to food, health, parenting, drugs, and employment. 
- Deliberate misgendering or use of ‘dead’ or rejected names. - Gratuitous or off-topic sexual images or behaviour in spaces where they’re not appropriate. -- Physical contact and simulated physical contact (eg, textual descriptions like “*hug*” or “*backrub*”) without consent or after a request to stop. +- Physical contact and simulated physical contact (eg, textual descriptions like "*hug*" or "*backrub*") without consent or after a request to stop. - Threats of violence. - Incitement of violence towards any individual, including encouraging a person to commit suicide or to engage in self-harm. - Deliberate intimidation. @@ -141,19 +79,21 @@ Harassment includes: - Unwelcome sexual attention. - Pattern of inappropriate social contact, such as requesting/assuming inappropriate levels of intimacy with others - Continued one-on-one communication after requests to cease. -- Deliberate “outing” of any aspect of a person’s identity without their consent except as necessary to protect vulnerable people from intentional abuse. +- Deliberate "outing" of any aspect of a person’s identity without their consent except as necessary to protect vulnerable people from intentional abuse. - Publication of non-harassing private communication. +Be mindful that others may not want their image or name on social media. Ask permission prior to posting about another person at in-person events. + The dbt Community prioritizes marginalized people’s safety over privileged people’s comfort. The dbt Community Admin team reserves the right not to act on complaints regarding: - ‘Reverse’ -isms, including ‘reverse racism,’ ‘reverse sexism,’ and ‘cisphobia’ -- Reasonable communication of boundaries, such as “leave me alone,” “go away,” or “I’m not discussing this with you.” +- Reasonable communication of boundaries, such as "leave me alone," "go away," or "I’m not discussing this with you." 
- Communicating in a ‘tone’ you don’t find congenial - Criticizing racist, sexist, cissexist, or otherwise oppressive behavior or assumptions ### Reporting harassment -If you are being harassed by a member of the dbt Community, notice that someone else is being harassed, or have any other concerns, please contact us at [community@dbtlabs.com](mailto:community@dbtlabs.com). +If you are being harassed by a member of the dbt Community, notice that someone else is being harassed, or have any other concerns, please contact us at [community@dbtlabs.com](mailto:community@dbtlabs.com) or use the workflows in [#moderation-and-administration](https://getdbt.slack.com/archives/C02JJ8N822H) on Slack. We will respect confidentiality requests for the purpose of protecting victims of abuse. At our discretion, we may publicly name a person about whom we’ve received harassment complaints, or privately warn third parties about them, if we believe that doing so will increase the safety of dbt community members or the general public. We will not name harassment victims without their affirmative consent. diff --git a/website/docs/community/resources/community-rules-of-the-road.md b/website/docs/community/resources/community-rules-of-the-road.md new file mode 100644 index 00000000000..12711b64c06 --- /dev/null +++ b/website/docs/community/resources/community-rules-of-the-road.md @@ -0,0 +1,70 @@ +--- +title: "dbt Community Rules of the Road" +id: "community-rules-of-the-road" +description: "This community is filled with smart, kind, and helpful people who share our commitment to elevating the analytics profession. These rules help everyone understand how to best participate." +--- + +As of June 2023, the dbt Community includes over 50,000 data professionals and is still growing. People genuinely love this community. It's filled with smart, kind, and helpful people who share our commitment to elevating the analytics profession. 
+ +We are committed to maintaining the spirit of this community, and have written these rules alongside its members to help everyone understand how to best participate. We appreciate your support in continuing to build a community we're all proud of. + +## Expectations for all members +### Rule 1: Be respectful +We want everyone in this community to have a fulfilling and positive experience. Therefore, this first rule is serious and straightforward; we simply will not tolerate disrespectful behavior of any kind. + +Everyone interacting on a dbt platform – including Slack, the forum, codebase, issue trackers, and mailing lists – is expected to follow the [Community Code of Conduct](/community/resources/code-of-conduct). If you are unable to abide by the code of conduct set forth here, we encourage you not to participate in the community. + +### Rule 2: Keep it in public spaces +Unless you have someone's express permission to contact them directly, do not directly message other community members, whether on a dbt Community platform or other spaces like LinkedIn. + +We highly value the time community members put into helping each other, and we have precisely zero tolerance for people who abuse their access to experienced professionals. If you are being directly messaged with requests for assistance without your consent, let us know in the [#moderation-and-administration](https://getdbt.slack.com/archives/C02JJ8N822H) Slack channel. We will remove that person from the community. Your time and attention is valuable. + +### Rule 3: Follow messaging etiquette +In short: put effort into your question, use threads, post in the right channel, and do not seek extra attention by tagging individuals or double-posting. For more information, see our [guide on getting help](/community/resources/getting-help). 
+ +### Rule 4: Do not solicit community members +This community is built for data practitioners to discuss the work that they do, the ideas that they have, and the things that they are learning. It is decidedly not intended to be lead generation for vendors or recruiters. + +Vendors and recruiters are subject to additional rules to ensure this space remains welcoming to everyone. These requirements are detailed below and are enforced vigorously. + +## Vendor expectations + +As a vendor/dbt partner, you are also a member of this community, and we encourage you to participate fully in the space. We have seen folks grow fantastic user relationships for their products when they come in with the mindset to share rather than pushing a pitch. At the same time, active community members have a finely honed sense of when they are being reduced to an audience or a resource to be monetized, and their response is reliably negative. + +:::info Who is a vendor? +Vendors are generally individuals belonging to companies that are creating products or services primarily targeted at data professionals, but this title also includes recruiters, investors, open source maintainers (with or without a paid offering), consultants and freelancers. If in doubt, err on the side of caution. +::: + +### Rule 1: Identify yourself +Include your company in your display name, e.g. "Alice (DataCo)". When joining a discussion about your product (after the waiting period below), be sure to note your business interests. + +### Rule 2: Let others speak first +If a community member asks a question about your product directly, or mentions that they have a problem that your product could help with, wait 1 business day before responding to allow other members to share their experiences and recommendations. (This doesn't apply to unambiguously support-style questions from existing users, or in your `#tools-` channel if you have one). 
+ +### Rule 3: Keep promotional content to specified spaces +As a space for professional practice, the dbt Community is primarily a non-commercial space. However, as a service to community members who want to be able to keep up to date with the data industry, there are several areas available on the Community Slack for vendors to share promotional material: +- [#vendor-content](https://getdbt.slack.com/archives/C03B0Q4EBL3) +- [#events](https://getdbt.slack.com/archives/C80RCAZ5E) +- #tools-* (post in [#moderation-and-administration](https://getdbt.slack.com/archives/C02JJ8N822H) to request a channel for your tool/product) + +Recruiters may also post in [#jobs](https://getdbt.slack.com/archives/C7A7BARGT)/[#jobs-eu](https://getdbt.slack.com/archives/C04JMHHK6CD) but may not solicit applications in DMs. + +The definition of "vendor content" can be blurry at the edges, and we defer to members' instincts in these scenarios. As a rule, if something is hosted on a site controlled by that company or its employees (including platforms like Substack and Medium), or contains a CTA such as signing up for a mailing list or trial account, it will likely be considered promotional. + +### One more tip: Be yourself +Speak in your own voice, and join in any or all of the conversations that interest you. Share your expertise as a data professional. Make a meme if you're so inclined. Get in a (friendly) debate. You are not limited to only your company's products and services, and making yourself known as a familiar face outside of commercial contexts is one of the most effective ways of building trust with the community. Put another way, [create more value than you capture](/community/resources/code-of-conduct#create-more-value-than-you-capture). + +Because unaffiliated community members are able to share links in any channel, the most effective way to have your work reach a wider audience is to create things that are genuinely useful to the community. 
+ + +## Handling violations + +The point of these rules is not to find opportunities to punish people, but to ensure the longevity of the community. Participation in this community is a privilege, and we reserve the right to remove people from it. + +To report an issue or appeal a judgement, email [community@dbtlabs.com](mailto:community@dbtlabs.com) or use the workflows in [#moderation-and-administration](https://getdbt.slack.com/archives/C02JJ8N822H) on Slack. + +Violations related to our anti-harassment policy will result in immediate removal. Other issues are handled in proportion to their impact, and may include: + +- a friendly, but public, reminder that the behavior is inappropriate according to our guidelines. +- a private message with a warning that any additional violations will result in removal from the community. +- temporary or permanent suspension of your account. diff --git a/website/docs/guides/legacy/getting-help.md b/website/docs/community/resources/getting-help.md similarity index 70% rename from website/docs/guides/legacy/getting-help.md rename to website/docs/community/resources/getting-help.md index 7be3a6a9c1b..5f423683014 100644 --- a/website/docs/guides/legacy/getting-help.md +++ b/website/docs/community/resources/getting-help.md @@ -7,20 +7,18 @@ dbt is open source, and has a generous community behind it. Asking questions wel ### 1. Try to solve your problem first before asking for help #### Search the existing documentation -The docs site you're on is highly searchable, make sure to explore for the answer here as a first step. If you're new to dbt, try working through the [Getting Started guide](/docs/get-started/getting-started/overview) first to get a firm foundation on the essential concepts. +The docs site you're on is highly searchable, make sure to explore for the answer here as a first step. If you're new to dbt, try working through the [quickstart guide](/quickstarts) first to get a firm foundation on the essential concepts. 
#### Try to debug the issue yourself -We have a handy guide on [debugging errors](debugging-errors) to help out! This guide also helps explain why errors occur, and which docs you might need to search for help. +We have a handy guide on [debugging errors](/guides/best-practices/debugging-errors) to help out! This guide also helps explain why errors occur, and which docs you might need to search for help. #### Search for answers using your favorite search engine -We're committed to making more errors searchable, so it's worth checking if there's a solution already out there! Further, some errors related to installing dbt, the SQL in your models, or getting yaml right, are errors that are not-specific to dbt, so there may be other resources to cehck. +We're committed to making more errors searchable, so it's worth checking if there's a solution already out there! Further, some errors related to installing dbt, the SQL in your models, or getting YAML right, are errors that are not specific to dbt, so there may be other resources to check. #### Experiment! If the question you have is "What happens when I do `X`", try doing `X` and see what happens! Assuming you have a solid dev environment set up, making mistakes in development won't affect your end users ### 2. Take a few minutes to formulate your question well Explaining the problems you are facing clearly will help others help you. - #### Include relevant details in your question Include exactly what's going wrong! When asking your question, you should: @@ -37,19 +35,21 @@ In general, people are much more willing to help when they know you've already g #### Share the context of the problem you're trying to solve Sometimes you might hit a boundary of dbt because you're trying to use it in a way that doesn't align with the opinions we've built into dbt. By sharing the context of the problem you're trying to solve, we might be able to share insight into whether there's an alternative way to think about it. 
+#### Post a single message and use threads +The dbt Slack's culture revolves around threads. When posting a message, try drafting it to yourself first to make sure you have included all the context. Include big code blocks in a thread to avoid overwhelming the channel. + +#### Don't tag individuals to demand help +If someone feels inclined to answer your question, they will do so. We are a community of volunteers, and we're generally pretty responsive and helpful! If nobody has replied to your question, consider if you've asked a question that helps us understand your problem. If you require in-depth, ongoing assistance, we have a wonderful group of experienced dbt consultants in our ecosystem. You can find a full list [below](#receiving-dedicated-support). + + ### 3. Choose the right medium for your question We use a number of different mediums to share information - If your question is roughly "I've hit this error and am stuck", please ask it on [the dbt Community Forum](https://discourse.getdbt.com). - If you think you've found a bug, please report it on the relevant GitHub repo (e.g. [dbt repo](https://github.com/dbt-labs/dbt), [dbt-utils repo](https://github.com/dbt-labs/dbt-utils)) -- If you are looking for an opinionated answer (e.g. "What's the best approach to X?", "Why is Y done this way?"), then, feel free to join our [Slack community](https://community.getdbt.com/) and ask it in the correct channel: - * **#advice-dbt-for-beginners:** A great channel if you're getting started with dbt and want to understand how it works. - * **#advice-dbt-for-power-users:** If you’re hitting an error in dbt that you don’t understand, let us know here. - * **#advice-data-modeling:** This channel is most useful when wanting to ask questions about data model design, SQL patterns, and testing. - * **#dbt-suggestions:** Got an idea for dbt? This is the place! 
- * Other channels: We're adding new channels all the time — please take a moment to browse the channels to see if there is a better fit +- If you are looking for a more wide-ranging conversation (e.g. "What's the best approach to X?", "Why is Y done this way?"), join our [Slack community](https://getdbt.com/community). Channels are consistently named with prefixes to aid discoverability. ## Receiving dedicated support -If you need dedicated support to build your dbt project, consider reaching out regarding [professional services](https://www.getdbt.com/contact/), or engaging one of our [consulting partners](https://www.getdbt.com/ecosystem/). +If you need dedicated support to build your dbt project, consider reaching out regarding [professional services](https://www.getdbt.com/contact/), or engaging one of our [consulting partners](https://partners.getdbt.com/english/directory/). ## dbt Training If you want to receive dbt training, check out our [dbt Learn](https://learn.getdbt.com/) program. @@ -60,14 +60,4 @@ If you want to receive dbt training, check out our [dbt Learn](https://learn.get - Billing - Bug reports related to the web interface -As a rule of thumb, if you are using dbt Cloud, but your problem is related to code within your dbt project, then please follow the above process rather than reaching out to support. - - +As a rule of thumb, if you are using dbt Cloud, but your problem is related to code within your dbt project, then please follow the above process rather than reaching out to support. 
\ No newline at end of file diff --git a/website/docs/community/resources/jobs-terms-and-conditions.md b/website/docs/community/resources/jobs-terms-and-conditions.md new file mode 100644 index 00000000000..f2f2134f847 --- /dev/null +++ b/website/docs/community/resources/jobs-terms-and-conditions.md @@ -0,0 +1,16 @@ +--- +title: "dbt Labs Community #jobs Channels Terms and Conditions" +id: "jobs-terms-and-conditions" +description: "Before posting a job in the dbt Community or submitting an application, review these terms and conditions." +--- + +I agree to abide by the [dbt Community Code of Conduct](/community/resources/code-of-conduct) and all laws applicable to me in my use of the dbt Community's #jobs channels. I further agree: + +- dbt Labs is not responsible for, nor does it warrant or guarantee, the validity, accuracy, completeness, legality, or reliability of any functionality of any #jobs channel, any posting's content, or any application and/or solicitation of any kind of employment. +- dbt Labs does not review and approve job-related content. +- dbt Labs disclaims liability of any kind whatsoever for any type of damage that occurs while using the community Slack for job-related reasons, and I waive any type of claim (including actual, special or consequential damages) to the maximum extent permitted by law. +- Without limitation, dbt Labs disclaims liability for quality, performance, merchantability, and fitness for a particular purpose, express or implied, that may arise out of my use of the community Slack for job-related content, my reliance on such information, and/or my provision/receipt of job-related information. +- I understand that no internet-based site is without risk, and my use is at my own risk. 
+- My use of any job-posting template (or other forum for providing job-related information) confirms my consent to provide the data posted, confirms that I have permission to post such data, and is subject to the terms of the [dbt Labs privacy policy](https://www.getdbt.com/cloud/privacy-policy). + +For further information, please contact [legal@dbtlabs.com](mailto:legal@dbtlabs.com). diff --git a/website/docs/community/resources/maintaining-a-channel.md b/website/docs/community/resources/maintaining-a-channel.md index 1961686d4a2..289fa389e80 100644 --- a/website/docs/community/resources/maintaining-a-channel.md +++ b/website/docs/community/resources/maintaining-a-channel.md @@ -1,6 +1,7 @@ --- title: "Maintaining a Slack channel" id: "maintaining-a-channel" +description: There are three things you should do to be a good channel maintainer in the dbt Slack community --- ## TL;DR @@ -9,7 +10,7 @@ There are three things you should do to be a good channel maintainer in the [dbt - Once you see some folks in the channel, post initial conversation topics to help them get to know each other. - Keep an eye out in #introductions for folks who might benefit from your new channel. For example if someone works in the space or on the same problems, then direct them to the channel. -- Make sure folks follow the [Rules of the Road](https://docs.getdbt.com/docs/contributing/slack-rules-of-the-road) in the channel. If you notice someone is not following one, gently remind them of the rule in thread, and, ideally, provide an example of how they can rephrase their message or where they can redirect it. If you have a question about how to proceed, just post it in #ask-a-moderator with a screenshot or link to the thread and someone will give you advice. +- Make sure folks follow the [Rules of the Road](https://docs.getdbt.com/docs/contributing/slack-rules-of-the-road) in the channel. 
If you notice someone is not following one, gently remind them of the rule in thread, and, ideally, provide an example of how they can rephrase their message or where they can redirect it. If you have a question about how to proceed, just post it in #moderation-and-administration with a screenshot or link to the thread and someone will give you advice. ## Scope of the role @@ -17,8 +18,8 @@ A maintainer can be a dbt Labs employee, but does not have to be. *Slack channel ## Initial instructions -1. Review the [Rules of the Road](community/resources/slack-rules-of-the-road) and [Code of Conduct](community/resources/code-of-conduct) and please let the the folks who created the channel know that you read both documents and you agree to be mindful of them. -2. If you are a vendor, review the [Vendor Guidelines](https://www.getdbt.com/community/vendor-guidelines). +1. Review the [Rules of the Road](/community/resources/community-rules-of-the-road) and [Code of Conduct](/community/resources/code-of-conduct) and please let the folks who created the channel know that you read both documents and you agree to be mindful of them. +2. If you are a vendor, review the [Vendor Expectations](/community/resources/community-rules-of-the-road#vendor-expectations). 3. Add the Topic and Description to the channel. @Mention your name in the channel Description, identifying yourself as the maintainer. Ex: *Maintainer: First Last (pronouns).* If you are a vendor, make sure your Handle contains your affiliation. 4. Complete or update your Slack profile by making sure your Company (in the ‘What I do’ field), Pronouns, and Handle, if you’re a vendor, are up-to-date. 5. Post initial conversation topics once a few folks get in the channel to help folks get to know each other. Check out this [example introductory post](https://getdbt.slack.com/archives/C02FXAZRRDW/p1632407767005000). @@ -27,11 +28,11 @@ A maintainer can be a dbt Labs employee, but does not have to be. 
*Slack channel ## Long-term expectations - Maintaining the channel, checking in and being active on a regular basis by answering folks' questions, and seeding discussions. Want an example? Check out [this poll](https://getdbt.slack.com/archives/C022A67TLFL/p1628279819038800). -- For guidance on how to answer a question, see [Answering Community Questions](https://www.getdbt.com/community/answering-community-questions). If you are not sure how to answer a lingering or unanswered question, you can post about it in #ask-a-moderator or direct it to another channel, if relevant. +- For guidance on how to answer a question, see [Answering Community Questions](https://www.getdbt.com/community/answering-community-questions). If you are not sure how to answer a lingering or unanswered question, you can post about it in #moderation-and-administration or direct it to another channel, if relevant. - If the channel is an industry channel, it’s helpful to monitor [#introductions](https://getdbt.slack.com/archives/CETJLH1V3) and invite people. Keep an eye out for folks who might benefit from being in the new channel if they mention they are working in the space, or are thinking about some of these problems. -- Make sure folks follow the [Rules of the Road](https://docs.getdbt.com/docs/contributing/slack-rules-of-the-road). For example, if you notice someone is not following one, gently remind them of the rule in thread, and, ideally, provide an example of how they can rephrase their message or where they can redirect it. If you have a question about how to proceed, just post about it in #ask-a-moderator with a link to the thread or screenshot and someone will give you advice. +- Make sure folks follow the [Rules of the Road](https://docs.getdbt.com/docs/contributing/slack-rules-of-the-road). 
For example, if you notice someone is not following one, gently remind them of the rule in thread, and, ideally, provide an example of how they can rephrase their message or where they can redirect it. If you have a question about how to proceed, just post about it in #moderation-and-administration with a link to the thread or screenshot and someone will give you advice. - In tools channels, sharing customer stories and product updates is very okay in this channel because folks expect that when they join. However, please avoid any direct sales campaigns, pricing offers, etc. -- If you have any questions/doubts about the [Rules of the Road](/community/resources/slack-rules-of-the-road) or [Vendor Guidelines](/community/resources/vendor-guidelines), please post a question in #ask-a-moderator about what sort of things the community expects from interactions with vendors. +- If you have any questions/doubts about the [Rules of the Road and Vendor Expectations](/community/resources/community-rules-of-the-road), please post a question in #moderation-and-administration about what sort of things the community expects from interactions with vendors. - A reminder that we never DM anyone in Slack without their permission in public channel or some prior relationship. - A reminder that @ here/all/channel are disabled. - Use and encourage the use of threads 🧵 to keep conversations tidy! diff --git a/website/docs/community/resources/oss-expectations.md b/website/docs/community/resources/oss-expectations.md index 66880bc63b7..9c916de1240 100644 --- a/website/docs/community/resources/oss-expectations.md +++ b/website/docs/community/resources/oss-expectations.md @@ -4,7 +4,7 @@ title: "Expectations for OSS contributors" Whether it's a dbt package, a plugin, `dbt-core`, or this very documentation site, contributing to the open source code that supports the dbt ecosystem is a great way to level yourself up as a developer, and to give back to the community. 
The goal of this page is to help you understand what to expect when contributing to dbt open source software (OSS). While we can only speak for our own experience as open source maintainers, many of these guidelines apply when contributing to other open source projects, too. -Have you seen things in other OSS projects that you quite like, and think we could learn from? [Open a discussion on the Developer Hub](https://github.com/dbt-labs/docs.getdbt.com/discussions/new), or start a conversation in the dbt Community Slack (for example: `#community-strategy`, `#dbt-core-development`, `#package-ecosystem`, `#adapter-ecosystem`). We always appreciate hearing from you! +Have you seen things in other OSS projects that you quite like, and think we could learn from? [Open a discussion on the dbt Community Forum](https://discourse.getdbt.com), or start a conversation in the dbt Community Slack (for example: `#community-strategy`, `#dbt-core-development`, `#package-ecosystem`, `#adapter-ecosystem`). We always appreciate hearing from you! ## Principles @@ -51,7 +51,7 @@ An issue could be a bug you’ve identified while using the product or reading t ### Best practices for issues -- Issues are **not** for support / troubleshooting / debugging help. Please [open a discussion on the Developer Hub](https://github.com/dbt-labs/docs.getdbt.com/discussions/new), so other future users can find and read proposed solutions. If you need help formulating your question, you can post in the `#advice-dbt-for-beginners` channel in the [dbt Community Slack](https://www.getdbt.com/community/). +- Issues are **not** for support / troubleshooting / debugging help. Please [open a discussion on the dbt Community Forum](https://discourse.getdbt.com), so other future users can find and read proposed solutions. If you need help formulating your question, you can post in the `#advice-dbt-help` channel in the [dbt Community Slack](https://www.getdbt.com/community/). 
- Always search existing issues first, to see if someone else had the same idea / found the same bug you did. - Many repositories offer templates for creating issues, such as when reporting a bug or requesting a new feature. If available, please select the relevant template and fill it out to the best of your ability. This will help other people understand your issue and respond. @@ -82,8 +82,8 @@ In some cases, the right resolution to an open issue might be tangential to the | `triage` | This is a new issue which has not yet been reviewed by a maintainer. This label is removed when a maintainer reviews and responds to the issue. | | `bug` | This issue represents a defect or regression from the behavior that's documented, or that you reasonably expect | | `enhancement` | This issue represents net-new functionality, including an extension of an existing capability | -| `good first issue` | This issue does not require deep knowledge of the codebase to implement. This issue is appropriate for a first-time contributor. | -| `help wanted` | This issue is trickier than a "good first issue." The required changes are scattered across the codebase, or more difficult to test. The maintainers are happy to help an experienced community contributor; they aren't planning to prioritize this issue themselves. | +| `good_first_issue` | This issue does not require deep knowledge of the codebase to implement. This issue is appropriate for a first-time contributor. | +| `help_wanted` | This issue is trickier than a "good first issue." The required changes are scattered across the codebase, or more difficult to test. The maintainers are happy to help an experienced community contributor; they aren't planning to prioritize this issue themselves. | | `duplicate` | This issue is functionally identical to another open issue. The maintainers will close this issue and encourage community members to focus conversation on the other one. 
| | `stale` | This is an old issue which has not recently been updated. In repositories with a lot of activity, stale issues will periodically be closed. | | `wontfix` | This issue does not require a code change in the repository, or the maintainers are unwilling to merge a change which implements the proposed behavior. | diff --git a/website/docs/community/resources/slack-rules-of-the-road.md b/website/docs/community/resources/slack-rules-of-the-road.md deleted file mode 100644 index 27774f9e35e..00000000000 --- a/website/docs/community/resources/slack-rules-of-the-road.md +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: "dbt Slack: Rules of the Road" -id: "slack-rules-of-the-road" ---- - -As of October 2022, the dbt Slack community includes 35,000+ data professionals and is growing month-over-month. People genuinely love this community. It’s filled with smart, kind, and helpful people who share our commitment to elevating the analytics profession. - -We are committed to maintaining the spirit of this community, and as such have written these rules to help new members understand how to best participate in our community. - -We appreciate your support in continuing to build a community we’re all proud of. - -## Rule 1: Be respectful -We want everyone to have a fulfilling and positive experience in dbt Slack and we are continuously grateful in your help ensuring that this is the case. - -The guidelines that follow are important, but transgressions around Slack etiquette are forgivable. This first rule, however, is serious -- we simply will not tolerate disrespectful behavior of any kind. - -Everyone interacting in dbt Slack, codebase, issue trackers, and mailing lists are expected to follow the [PyPA Code of Conduct](https://www.pypa.io/en/latest/code-of-conduct/). If you are unable to abide by the code of conduct set forth here, we encourage you not to participate in the community. 
- -## Rule 2: Use the right channel -It’s important that we make it possible for members of the community to opt-in to various types of conversations. Our different Slack channels specifically exist for this purpose. Our members do a wonderful job at making sure messages are posted in the most relevant channel, and you’ll frequently see people (respectfully!) reminding each other about where to post messages. Here's a guide to our channels: -- If you're new to dbt and unsure where something belongs, feel free to post in **#advice-dbt-for-beginners** - we'll be able to direct you to the right place -- **For job postings, use #jobs**. If you post a job description outside of #jobs, we will delete it and send you a link to this rule. -- For database-specific questions, use **#db-snowflake**, **#db-bigquery**, **#db-redshift**, or similar. -- For questions about data modeling or for SQL help, use **#modeling** -- For conversations unrelated to dbt or analytics, consider if dbt Slack is an appropriate medium for the conversation. If so, use **#memes-and-off-topic-chatter**. - -If you're hitting an error, you should post your question in [the Community Forum](https://discourse.getdbt.com) instead. - -## Rule 3: Put effort into your question -dbt Slack is a community of volunteers. These are kind, knowledgeable, helpful people who share their time and expertise for free. - -A thoughtful and well-researched post will garner far more responses than a low-effort one. See the guide on [getting help](getting-help) for more information about how to ask a good question. - -## Rule 4: Mark your questions as resolved -Were you in need of help, and received a helpful reply? Please mark your question as resolved by adding a ✅ reaction to your original post. Note that other community members may summon Slackbot to remind you to do this, by posting the words `resolved bot` as a reply to your message. - -## Rule 5: Do not double-post -Our best members are respectful of peoples’ time. 
We understand that even though a question feels urgent, dbt Slack is not a customer service platform, it is a community of volunteers. - -The majority of questions in dbt Slack get answered, though you may need to wait a bit. If you’re not getting a response, please do not post the same question to multiple channels (we’ll delete your messages and send you a link to this page). Instead, review your question and see if you can rewrite it better to make it easier for someone to answer quickly. - -## Rule 6: Keep it in public channels -Unless you have someone’s express permission to contact them directly, **do not directly message members of this community to solicit help, sell a product, or recruit for a role**. - -We highly value the time community members put into helping each other, and we have precisely zero tolerance for people who abuse their access to experienced professionals. If you are being directly messaged by members of the community asking for assistance without your consent, let us know. We will remove that person from the community. Your time and attention is valuable. - -## Rule 7: Do not solicit members of our Slack -This community is built for data practitioners to discuss the work that they do, the ideas that they have, and the things that they are learning. It is decidedly not intended to be lead generation for vendors or recruiters. - -**Do not pitch your products or services in dbt Slack**: this isn't the right place for that. Vendors can add enormous value to the community by being there to answer questions about their products when questions arise. - -Further, **do not use our Slack community for outbound recruitment for a role**. Recruiters should feel free to post opportunities in the #jobs channel, but should not directly contact members about an opportunity. - -We appreciate when vendors and recruiters identify themselves clearly in their Slack username. 
If you see someone pitching products and services in dbt Slack, or contact you directly about an open role, let us know. We’ll delete the message and remind that person about this rule. - -## Rule 8: Do not demand attention with @channel and @here, or by tagging individuals -The @channel and @here keywords in Slack are disabled for everyone except admins. If you make a post containing @channel or @here, nothing will happen. Still, we'll send you a link to this rule to help you better understand how dbt Slack operates. - -Do not tag individuals for in-depth assistance in your questions. If someone feels inclined to answer your question, they will do so. We are a community of volunteers, and we're generally pretty responsive and helpful! If nobody has replied to your question, consider if you've asked a question that helps us understand your problem. If you require in-depth, ongoing assistance, we have a wonderful group of experienced dbt consultants in our ecosystem. You can find a full list [here](https://www.getdbt.com/ecosystem/). - -## Rule 9: Use threads -The best way to keep conversations coherent in Slack is to use threads. The dbt Slack community uses threads heavily and if you break this convention, a member of the community will let you know. - -Here are some guidelines on how to use threads effectively: -* Type your question out as one message rather than separate messages (Pro Tip: Write a first draft of your question as a direct message to yourself) -* Leverage Slack's edit functionality if you realize you forgot to add something to your question rather than adding new messages. -* If you see a conversation taking place across discrete messages, send over a link to this rule. 
diff --git a/website/docs/community/resources/vendor-guidelines.md b/website/docs/community/resources/vendor-guidelines.md deleted file mode 100644 index 1b6bb6c9511..00000000000 --- a/website/docs/community/resources/vendor-guidelines.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -title: "Vendor guidelines" -id: "vendor-guidelines" ---- - -# Engaging in the dbt Community as a Vendor - -A key aspect that makes dbt stand out from other tools is the dbt Community. -This community was built to drive our mission statement of empowering analysts. -This includes advancing the field of analytics engineering practices. -We are creating spaces where folks can learn from each other, share best practices, -discover what it means to use software engineering workflows, and so on. - -The dbt community extends far beyond what happens in dbt Slack. There are regular meetups, -blog posts, and even a conference! Our North Star is to extend the knowledge loop; -we are a community, not an audience. - -Our community members expect a thoughtful space full of kind, curious, and bright individuals. -They contribute to the knowledge loop with their own expertise and benefit from the relevant knowledge brought to the table by other community experts (including vendors). -Along those lines, **we value diversity and inclusion**. -We seek to amplify underrepresented communities and have no tolerance for anyone who is disrespectful in this space. - -As a vendor/dbt partner, you are also a member of this community, one that we want -and deeply encourage to share your expertise in tooling, analytics, etc. -Our community members are truly open to discovering and discussing innovative solutions and tools. -We have seen folks grow fantastic user relationships for their products when they come in with the mindset to share rather than pushing a pitch. - -To guide you on your community journey, we have created this document for you to read and share with your coworkers. 
-By following these guidelines, you will help us maintain this community as well as gain -full access to all the benefits that this community can provide. - - -## Dos & Don'ts for dbt Slack - -### Dos -- **Read the Rules of The Road.** These rules are the best ways to participate in our community. -- **Fill your profile!** We want to get to know you so do upload a picture of yourself and add your company in your name (e.g. "Alice (DataCo)"). Be sure to include your company in your profile so folks know that you work for a vendor -- **Introduce Yourself in #introductions.** Tell us about yourself! -- **Be helpful.** We encourage folks to answer questions and offer their product expertise to conversations already in motion. You can even invite folks to chat in DMs if anyone wants more info about your product. But be sure you identify yourself and your business interests in thread. -- **Be yourself when posting, speak in your own voice.** -- **Participate in all the conversations that interest you.** Make a meme if you’re so inclined. Get in a (friendly) debate. You are not limited to only your company's products and services. -- **Post with intention.** If you have a link or product update that is appropriate to share, give context. - -### Don'ts -- **Do not do 1:1 outbound.** Only initiate DMs if you’ve received active confirmation in a public channel that a DM would be welcome. -- **Do not be anonymous.** Folks who identify themselves clearly are able to build empathy and form genuine relationships much easier. This is what we want for the community. -- Spam channels with Marketing material. -- **Do not post without context.** Posts that include context outside of just the pitch are the ones that add value to our community. - - -## Summary - -This community is centered around feeding into the knowledge loop. It’s a place intended for building genuine, helpful connections. We found that most vendors find success in our space by leading with this intention. 
- -Here are some ways you can contribute to the community: - -- contribute to the dbt core repository -- write dbt packages -- write other public content (blog posts, case studies, etc.) -- respond to questions on slack / discourse -- host events -- promote / respond to content written by community members -- Partner up with community members on blog posts/code/etc. - -For more information on the thought behind our community, especially if you are interested in creating your own, feel free to -reach out to our community managers. diff --git a/website/docs/community/spotlight/alan-cruickshank.md b/website/docs/community/spotlight/alan-cruickshank.md new file mode 100644 index 00000000000..74ef95a2b61 --- /dev/null +++ b/website/docs/community/spotlight/alan-cruickshank.md @@ -0,0 +1,43 @@ +--- +id: alan-cruickshank +title: Alan Cruickshank +description: | + I've been around in the dbt community, especially the London dbt Meetup, since early 2019—around the time that we started using dbt at tails.com. My background is the startup/scaleup space and building data teams in a context where there is a lot of growth going on but there isn't a lot of money around to support that. That's a topic that I've written and spoken about on several occasions on podcasts, blogposts and even at Coalesce 2020 and 2021! + + Aside from my work at tails.com, my other main focus at the moment is SQLFluff, the open source SQL linter which I started developing as part of a hackday at tails.com in late 2019 and now is the most starred SQL linter on Github with almost 1M downloads a month. 
+image: /img/community/spotlight/alan-cruickshank.jpg +pronouns: he/him +location: London, UK +jobTitle: Insights Director +companyName: tails.com +organization: Author & Maintainer of SQLFluff +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/amcruickshank/ + - name: SQLFluff + link: https://sqlfluff.com +dateCreated: 2023-06-30 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I [joined the community](https://www.getdbt.com/community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) in 2019 and it's been an invaluable source of advice and wisdom, especially operating on the bleeding edge of open source data tooling. It's been a place to meet like-minded people, even find new colleagues and certainly one of the places I look to when thinking about how to approach hairy data problems. + +In London it's also been one of the most vibrant meetup groups in person, compared to many others which are either very, very specialized or more focussed on larger organisations. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I just want to be useful 😁. I've learned a lot from the community over the years, and now I want to be able to give back to it. My primary vehicle for that is SQLFluff - both as a tool for the community to use, but also as a way of encouraging a wider group of people to feel welcome and able to contribute to open source software and build the tools of the future. + +I also see SQLFluff as a vehicle to drive more consistency in the way we write SQL, and through that drive better communication and lower the barrier for new people to enter this field and find their own success. + +## What have you learned from community members? What do you hope others can learn from you? 
+ +For better or worse, I spend most of my day job on people and organisational things, less on how to solve individual problems, and more on how to enable and support groups of people in being able to make great decisions themselves. In some ways, if I have to touch the keyboard too much, it's a sign that I've failed in that calling. dbt itself is a tool which enables better collaboration—and the community is full of people with great ideas on how to better enable other people around us. I hope that I'm able to pass some of that knowledge and the experience of applying it in a scaleup environment back to others also treading this path. + +More specifically from the dbt community, if I were to pick one recommendation, it would be Emilie Schario’s talk from Coalesce 2022 on [“Data Led is Dumb”](https://www.youtube.com/watch?v=WsMHPALc8Vg&t=1s). I think it should be essential watching for anyone who’s hearing “Data Led” a lot, and wants to turn that excitement into practical action. + +## Anything else interesting you want to tell us? + +If you're not using SQLFluff on your dbt project, you probably should be: https://github.com/sqlfluff/sqlfluff diff --git a/website/docs/community/spotlight/anya-prosvetova.md b/website/docs/community/spotlight/anya-prosvetova.md new file mode 100644 index 00000000000..521f87f4122 --- /dev/null +++ b/website/docs/community/spotlight/anya-prosvetova.md @@ -0,0 +1,35 @@ +--- +id: anya-prosvetova +title: Anya Prosvetova +description: | + I’m a Data Engineer with a background in SaaS, consulting, financial services and the creative industries. I help organisations convert data into value, developing data pipelines and automating processes. I’m also a Tableau Visionary and DataDev Ambassador, and one of the organisers of Data + Women Netherlands community. I became an active member of the dbt Community about a year ago, and it was a great place to learn and ask questions. 
And it was really inspiring to speak at the first Amsterdam dbt Meetup recently, and meet the local community of fellow Analytics and Data Engineers. +image: /img/community/spotlight/anya-prosvetova.jpg +pronouns: she/her +location: Amsterdam, Netherlands +jobTitle: Senior Data Engineer +companyName: Aimpoint Digital +organization: Tableau Visionary & DataDev Ambassador +socialLinks: + - name: Twitter + link: https://www.twitter.com/anyalitica + - name: LinkedIn + link: https://uk.linkedin.com/in/annaprosvetova + - name: Website + link: https://prosvetova.com +dateCreated: 2023-03-29 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +A few years back, I became a member of the dbt Community, but it wasn't until about a year ago, when I started using dbt at work, that I began actively engaging with it. Being the only data person in my company, the Community became a valuable resource for me to learn and ask questions. It's an excellent platform to gain insights from others, exchange experiences, and stay up-to-date with the latest product features. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +What I enjoy about the dbt Community is that its thought leaders are focused on working together to create a culture of mutual support and shared learning. Everyone is welcome to ask a question or share their latest blog without the fear of being judged. I believe that everyone has something valuable to contribute to the community, and I hope to help facilitate this supportive and collaborative environment where we can all learn from each other. + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned a lot about best practices for working with dbt and data in general, as well as tips and tricks for specific use cases. 
I've also gained a better understanding of the diverse range of data challenges that people face in different industries and contexts. + +As for what I hope others can learn from me, I aim to share my own experiences and knowledge in a way that is approachable and useful to people at all skill levels and backgrounds. diff --git a/website/docs/community/spotlight/bruno-de-lima.md b/website/docs/community/spotlight/bruno-de-lima.md new file mode 100644 index 00000000000..7f40f66859c --- /dev/null +++ b/website/docs/community/spotlight/bruno-de-lima.md @@ -0,0 +1,53 @@ +--- +id: bruno-de-lima +title: Bruno de Lima +description: | + I am an Analytics Engineer and aspiring tech writer coming from an academic engineering background. + + I worked at Indicium as an Analytics Engineer for more than a year, having worked with dbt (of course, every day) for transformation; BigQuery, Snowflake, and Databricks as data warehouses; Power BI and Tableau for BI; and Airflow for orchestration. + + I actively participate in the dbt community, having attended two dbt meetups in Brazil organized by Indicium; writing about dbt-related topics in my Medium and LinkedIn profiles; contributing to the code; and frequently checking dbt Slack and Discourse, helping (and being helped by) other dbt practitioners. If you are a community member, you may have seen me around! +image: /img/community/spotlight/bruno-de-lima.jpg +pronouns: he/him +location: Florianópolis, Brazil +jobTitle: Data Engineer +companyName: phData +organization: "" +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/brunoszdl/ + - name: Medium + link: https://medium.com/@bruno.szdl +dateCreated: 2023-03-28 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I was not truly happy with my academic life. My career took a new turn when I enrolled in the Analytics Engineer course by Indicium. 
That was my first contact with dbt, and I didn't realize how much it would transform my career. After that, I was hired at the company as an Analytics Engineer and worked extensively with dbt from day one. + +It took me some time to become an active member of the dbt community. I started working with dbt at the beginning of 2022 and became more involved towards the end of that year, encouraged by Daniel Avancini. I regret not doing this earlier, because being an active community member has been a game-changer for me, as my knowledge of dbt has grown exponentially just by participating in daily discussions on Slack. I have found #advice-dbt-help and #advice-dbt-for-power-users channels particularly useful, as well as the various database-specific channels. Additionally, the #i-made-this and #i-read-this channels have allowed me to learn about the innovative things that community members are doing. + +Inspired by other members, especially Josh Devlin and Owen Prough, I began answering questions on Slack and Discourse. For questions I couldn't answer, I would try engaging in discussions about possible solutions or provide useful links. I also started posting dbt tips on LinkedIn to help practitioners learn about new features or to refresh their memories about existing ones. + +By being more involved in the community, I felt more connected and supported. I received help from other members, and now, I could help others, too. I was happy with this arrangement, but more unexpected surprises came my way. My active participation in Slack, discourse, and LinkedIn opened doors to new connections and career opportunities. I had the pleasure of meeting a lot of incredible people and receiving exciting job offers. + +Thanks to the dbt community, I went from feeling uncertain about my career prospects to having a solid career and being surrounded by incredible people. 
+ +I would like to thank the Indicium folks for opening the first door for me for this career in data, and not just for me but for lots of people in Brazil trying to migrate from different fields who would not have this opportunity otherwise. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I identify with Gwen Windflower and Joel Labes, or at least they are the kind of leader I admire. Their strong presence and continuous interaction with all types of dbt enthusiasts make everyone feel welcomed in the community. They uplift those who contribute to the community, whether it's through a LinkedIn post or answering a question, and provide constructive feedback to help them improve. And of course they show a very strong knowledge about dbt and data in general, which is reflected in their contributions. + +And that is how I aspire to grow as a leader in the dbt Community. Despite being an introvert, I like interacting with people, helping solve problems and providing suggestions. Recognizing and acknowledging the achievements of others is also important to me, as it fosters a positive environment where everyone's contributions are valued. And I am continuously learning about dbt to improve my skills, and to become a trustworthy reference for others to rely on. + +## What have you learned from community members? What do you hope others can learn from you? + +I have learned that regardless of one's level of expertise, each person's voice is valued and respected in the community. I have also learned the importance of helping others and thinking critically, not just answering questions, but ensuring that it is the right question. By actively engaging with others, sharing knowledge and insights, we can collectively improve our understanding and use of dbt. 
Moreover, I have discovered that having fun with dbt and fostering a positive, supportive community culture can greatly enhance the learning experience. + +I hope others can learn from me that it doesn’t matter who you are, where you are from and how old you are, you can make a difference in the community. I hope to inspire others to become more involved in the community, and to not be afraid to share their thoughts or ideas, or to post something because they think it is not cool enough. Through this process of mutual learning and support, we can accelerate our professional development and achieve our goals. So don't hold back, take initiative, and be an active contributor to this amazing community! + +## Anything else interesting you want to tell us? + +I would like to mention my very first contribution to the community, a dbt commands cheatsheet. I made it because I was very new to dbt and wanted a resource where I could quickly check the available commands and what I could do with them. I made it for me, but then I thought it could help other beginners and shared it. I was incredibly surprised when it appeared in a dbt newsletter, and I think that was the starting point for me in the community. At this point, I knew everyone could contribute and felt more comfortable to do more of that. diff --git a/website/docs/community/spotlight/david-effiong.md b/website/docs/community/spotlight/david-effiong.md new file mode 100644 index 00000000000..013884b1c4e --- /dev/null +++ b/website/docs/community/spotlight/david-effiong.md @@ -0,0 +1,37 @@ +--- +id: david-effiong +title: David Effiong +description: | + I started my career as a data analyst but I currently work as a data engineer in a financial institution. I have experience working in both large organisations and startups. I have been in the dbt community for about 1 year and 6 months. 
I found out about dbt while working at a startup where I implemented a modern data stack using BigQuery, Airbyte, Metabase, and dbt. Currently my stack in my large organisation includes Azure tools + dbt. (😁 Of course I had to use dbt!) I have a YouTube channel where I share learnings about data and productivity. The name of my channel is David Data, please check it out. I spoke at the first in-person Lagos dbt meetup, and I am currently an organiser of the Lagos dbt meetup. +image: /img/community/spotlight/david-effiong.jpg +pronouns: he/him +location: Lagos, Nigeria +jobTitle: Data Engineer +companyName: Sterling Bank PLC +organization: Young Data Professionals +socialLinks: + - name: Twitter + link: https://www.twitter.com/@david_uforo + - name: LinkedIn + link: https://www.linkedin.com/in/david-effiong + - name: YouTube + link: https://www.youtube.com/@daviddata +dateCreated: 2023-03-26 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community in late 2021 when I joined the startup. I was a data team of one with little experience in the domain and the dbt community was and has remained impactful to my career. With the help of the community I was able to build a data stack as a team of one because there was always support to answer questions I post on the community. The community is so rich with value from conversations that you can read through threads and learn best practices or diverse approaches to problem solving. The dbt community has also been of great help to me in my current organisation in implementing dbt as part of the stack for data quality assurance purposes. The community is open to support anyone regardless of nationality or skill level and I am happy and grateful to be a part of this community. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? 
+ +I identify with Opeyemi Fabiyi as a community leader. Opeyemi introduced me to dbt as a tool and as a community. Based on this belief in the power of communities, he went on to start Young Data Professionals, pioneered dbt meetups in Lagos, Nigeria, and also spoke at Coalesce 2022. I am looking to grow my leadership in the community by interacting more in community conversations, organizing more dbt meetups this year and also by continuing to share my dbt learning videos on my YouTube Channel. + +## What have you learned from community members? What do you hope others can learn from you? + +I have learned solutions to technical problems from community members. I have also learned empathy and patience from community members while interacting with others. I hope I can provide technical solutions to other community members and also do it with patience and empathy. I also hope others can learn to be more involved in the community because the community has only grown because of people, and as more people get involved, more impact is made. + +## Anything else interesting you want to tell us? + +Outside of work, I play the piano and sing in the choir. I also write a faith-based blog, [The Writings of David Uforo](https://daviduforo.wordpress.com/). You may want to check it out. diff --git a/website/docs/community/spotlight/emily-riederer.md b/website/docs/community/spotlight/emily-riederer.md new file mode 100644 index 00000000000..6e5fd2f6451 --- /dev/null +++ b/website/docs/community/spotlight/emily-riederer.md @@ -0,0 +1,48 @@ +--- +id: emily-riederer +title: Emily Riederer +description: | + I'm a long-time dbt user and have been an active community member for a few years. + + Professionally, I've led a variety of data teams at Capital One spanning analytics, modeling, innersource data tools, and data infrastructure. The common denominator of all of these roles has been the overwhelming importance of high quality data processing pipelines.
Outside of work, I enjoy doing pro bono projects and applying my same skillset to scrappier environments. + + My work with the dbt community is motivated by a passion for data quality and developer tooling. Some of my recent contributions include maintaining the dbtplyr package, speaking at Coalesce 2021, and writing a dbt Developer Blog post about my PR to the dbt-utils test suite. +image: /img/community/spotlight/emily-riederer.jpg +pronouns: she/her +location: Chicago, IL +jobTitle: Senior Manager - Data Science & Analytics +companyName: Capital One +organization: rOpenSci Editorial Board +socialLinks: + - name: Twitter + link: https://twitter.com/emilyriederer + - name: LinkedIn + link: https://linkedin.com/in/emilyriederer + - name: Website + link: https://emilyriederer.com + - name: GitHub + link: https://github.com/emilyriederer + - name: Mastodon + link: https://mastodon.social/@emilyriederer +dateCreated: 2023-03-22 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I have been involved in the dbt community for a few years now. While I enjoy being actively engaged, one of my favorite parts is simply "lurking" on the Slack channels. The data space is moving so fast right now with so many different competing frameworks, tools, and ideas. At the same time, data work tends to be less discussed and publicly shared than analysis methods (e.g. new modeling packages) due to data privacy and IP. I've found no better place to "drink from the firehose" and benefit from the insights of others' questions, challenges, and successes. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +Two community members that really inspire me are Claire Carroll and Joel Labes.
I think both showcase excellence in technical best practices, crystal-clear communication of technical concepts in their prolific writing, and a passion for building community and creating on-ramps. That mix of so-called 'hard' and 'soft' skills adds so much to the community and helps empower every member to be their best. I'm always looking to balance the time I spend growing my skills along both dimensions. + +## What have you learned from community members? What do you hope others can learn from you? + +Given my passion for data quality and the design side of data, I particularly enjoy thinking about data modeling and learning from the community's experience with the variety of classical and novel frameworks for designing resilient, flexible datamarts. + +As a passionate fan of open-source (and the also-thriving #rstats community), I hope to inspire others to create more packages and PRs that expand the developer toolkit. I also particularly enjoy discussing my thoughts on data quality and avoiding data disasters. + +## Anything else interesting you want to tell us? + +test +My passion for open-source tools and open-knowledge extends beyond dbt. I also enjoy serving on the editorial board for rOpenSci to champion the creation of open-source research software, reviewing technical books for CRC Press, doing pro-bono data projects, and sharing my own learnings through conference talks and writing (including on my website, guest blogs, and books including [R Markdown Cookbook](https://bookdown.org/yihui/rmarkdown-cookbook/) and [97 Things Every Data Engineer Should Know](https://www.oreilly.com/library/view/97-things-every/9781492062400/)). 
diff --git a/website/docs/community/spotlight/fabiyi-opeyemi.md b/website/docs/community/spotlight/fabiyi-opeyemi.md new file mode 100644 index 00000000000..f26ee27910b --- /dev/null +++ b/website/docs/community/spotlight/fabiyi-opeyemi.md @@ -0,0 +1,41 @@ +--- +id: fabiyi-opeyemi +title: Opeyemi Fabiyi +description: | + I'm an Analytics Engineer with Data Culture, a Data Consulting firm where I use dbt regularly to help clients build quality-tested data assets. I've also got a background in financial services and supply chain. I'm passionate about helping organizations to become data-driven and I majorly use dbt for data modeling, while the other aspect of the stack is largely dependent on the client infrastructure I'm working for, so I often say I'm tool-agnostic. 😀 + + I'm the founder of Nigeria's Young Data Professional Community. I'm also the organizer of the Lagos dbt Meetup which I started, and one of the organizers of the DataFest Africa Conference. I became an active member of the dbt Community in 2021 & spoke at Coalesce 2022. +image: /img/community/spotlight/fabiyi-opeyemi.jpg +pronouns: he/him +location: Lagos, Nigeria +jobTitle: Senior Analytics Engineer +companyName: Data Culture +organization: Young Data Professionals (YDP) +socialLinks: + - name: Twitter + link: https://twitter.com/Opiano_1 + - name: LinkedIn + link: https://www.linkedin.com/in/opeyemifabiyi/ +dateCreated: 2023-07-02 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the [dbt Slack community](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) in 2021, and it has been an experience getting to learn from thought leaders in the space and stay in touch with cutting-edge innovation in the data space. 
The community has helped me become a better engineer by reading different responses to questions on Slack, and seeing genuine support from community members help other members tackle and solve their difficult problems is inspiring and has allowed me to model my community (YDP & the Lagos dbt Meetup) through that lens. I randomly enter the dbt Slack daily to read and learn from different channels. I love the sense of community that resonates in the dbt Slack channel, and the good news is that I got my current role from the #jobs channel from a post from Data Culture Co-Founder. So you can stay glued to that page if you are looking for a job role. + +The dbt community greatly impacted my previous role as a one-person data team. The community became the team I didn't have, providing all the necessary support and guidance I needed to deliver great value for the company excellently, and my experience with the community was the inspiration for my Coalesce talk in 2022 on how to leverage the dbt community as a data team of one. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +Many great leaders inspire me in the dbt community; Joel Labes for constantly interacting with new folks and providing that safe space for everyone to ask any question, no matter how dumb you may think your question may be. He will give a response that will solve your problem; Benn Stancil for his vast experience and how he communicates it well with humour in his Friday night Substack, a newsletter I look forward to, which helps me stay current with recent trends in the global data space. + +Both of them resonate with the kind of leader I want to grow in the dbt Community; to be vast, experienced and readily available to provide support and guidance and help people solve problems and grow their careers. + +## What have you learned from community members? What do you hope others can learn from you? 
+ +I've learned how to show empathy as a data professional and be a great engineer from various best practices around working with data. I also want others to know that irrespective of their current level of expertise or maturity in their data career, they can make an impact by getting involved in the community and helping others grow. + +## Anything else interesting you want to tell us? + +Maybe, I will consider DevRel as a career sometime because of my innate passion and love for community and people. Several folks tell me I'm a strong DevRel talent and a valuable asset for any product-led company. If you need someone to bounce ideas off of or discuss😃 your community engagement efforts, please feel free to reach out. diff --git a/website/docs/community/spotlight/faith-lierheimer.md b/website/docs/community/spotlight/faith-lierheimer.md new file mode 100644 index 00000000000..3edb839bb1d --- /dev/null +++ b/website/docs/community/spotlight/faith-lierheimer.md @@ -0,0 +1,47 @@ +--- +id: faith-lierheimer +title: Faith Lierheimer +description: | + I've been a dbt Community member for around a year and a half. I come to the data world from teaching and academic research. Working in data fuses the aspects of those careers that I like the most, which are technical problem solving, and helping non-technical audiences understand data and what they can do with it. I have a dream stack with Databricks, dbt, and Looker. + + Professionally, I help shippers of perishable goods (everything from blueberries to childhood vaccinations) understand the risks their goods face in transit and how to mitigate them. This reduces food and medical waste worldwide. + + You can read more about these interests at faithfacts.substack.com.
+image: /img/community/spotlight/faith-lierheimer.jpg +pronouns: she/her +location: Denver, CO, USA +jobTitle: Data Analyst II +companyName: Parsyl +organization: Data Angels +socialLinks: + - name: Twitter + link: https://twitter.com/FaithLierheimer + - name: LinkedIn + link: https://www.linkedin.com/in/faithlierheimer/ + - name: Substack + link: https://faithfacts.substack.com/ + - name: Data Folks + link: https://data-folks.masto.host/@faithlierheimer +dateCreated: 2023-06-28 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community early in 2022 hoping to find technical help with dbt, and instead found a wide support network of career-minded data professionals. Being in the dbt community has helped me find my niche in the data world, and has helped me discover ways I can grow my career and technical acumen. Being in this community has been huge in easing my career transition from teaching into data. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I primarily conceptualize of leadership as raising the floor beneath everyone, rather than enabling a few to touch its vaulted ceiling. As I gain more experience, I'd be delighted to be a resource for fellow career changers and teachers in transition. + +And, I love to goof in #roast-my-graph in the dbt Slack. [Come join](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) that channel, it's a hoot and a holler. + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned a lot from community members, but most notably and concretely, I've actually gotten excellent visualization advice in #roast-my-graph. 
I've taken graphs there several times where I felt stuck on the presentation and have learned a lot about effective vizzes from my peers there. + +As I continue to gain experience, I hope others can learn from me what a successful career change looks like. And, ultimately, to take the work seriously but to not take ourselves that seriously. + +## Anything else interesting you want to tell us? + +I have a black cat with one eye named Gus and my purpose is now to give him the best existence possible. diff --git a/website/docs/community/spotlight/jing-yu-lim.md b/website/docs/community/spotlight/jing-yu-lim.md new file mode 100644 index 00000000000..a3d1784293f --- /dev/null +++ b/website/docs/community/spotlight/jing-yu-lim.md @@ -0,0 +1,41 @@ +--- +id: jing-yu-lim +title: Jing Yu Lim +description: | + For ~3 years, I was a Product Analyst at Grab, a ride-hailing and food delivery app in Southeast Asia, before taking on an Analytics Engineering role in Spenmo, a B2B Fintech startup. I joined a tech company as an analyst in June 2023, but was recently impacted by a layoff. I'm also one of the co-organisers of the Singapore dbt Meetup! + + My story with dbt started in Jan 2022, when I joined Spenmo where I taught myself dbt, mainly via dbt's documentation and Slack community. We used Snowflake as our data warehouse, and Holistics for BI. I spoke about data self-serve and Spenmo's journey with dbt at multiple meetups. +image: /img/community/spotlight/jing-lim.jpg +pronouns: she/her +location: Singapore, Singapore +jobTitle: I'm open to work! +companyName: "" +organization: "" +socialLinks: + - name: LinkedIn + link: https://www.linkedin.com/in/limjingyu/ +dateCreated: 2023-07-01 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined the dbt community late January 2022, while setting up Spenmo's first dbt project. 
I was completely new to dbt, and relied heavily on the #advice-dbt-help channel in dbt Slack whenever I got stuck. I have learnt so much from reading discussions in other channels as well (e.g. #leading-data-teams, #advice-mock-interviews, #db-snowflake, #tools-holistics). + +The dbt community also helped me expand my professional network, where I met so many amazing individuals! It all started with #local-singapore which was created by community member Jolanda Zwagemaker sometime in April 2022. We organised dinners to connect with one another, which eventually led to an opportunity to run Singapore dbt Meetup (HUGE thank you to dbt) - it is heartwarming to see connections forged between many attendees of the meetup, where we also learn from one another. It really does feel like a community! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +Claire Carroll and Mila Page! My very first touchpoint with dbt was their articles in [The Analytics Engineering Guide](https://www.getdbt.com/analytics-engineering/). I remember relating to it so much that I was saying "YES" to every other line I read, and sending text snippets to my friends. + +To me, Analytics Engineering could help overcome certain challenges I face as an analyst, and make the job feel less like a "hamster wheel." As the concept of analytics engineering is fairly new in Singapore, I feel the need to spread the word and bring about a mindset shift among not just data teams, but anyone who needs to work with a data team. + +## What have you learned from community members? What do you hope others can learn from you? + +One of my favourite presentations from the Singapore dbt Meetup was ["How would the ideal Semantic Layer look like?"](https://docs.google.com/presentation/d/1t1ts04b7qA-BVlV3qbNZ4fI-MSZn0iL6_FhsaWhJk_0/edit?usp=sharing ) by fellow community member Thanh Dinh from Holistics.
It taught me a new perspective on metrics: they could be like dbt models, where dependencies can be set up between metric models. + +I definitely have so much more to learn as an individual, but I hope to share some of my tips and lessons in terms of data modelling with others. + +## Anything else interesting you want to tell us? + +Thank you dbt for enabling us to run meetups! It has been critical for ensuring a great experience for the Singapore community. Also a huge shoutout to Amada, the Global Community Development Lead, for always being super helpful and supportive despite the 12-hour time difference! diff --git a/website/docs/community/spotlight/josh-devlin.md b/website/docs/community/spotlight/josh-devlin.md new file mode 100644 index 00000000000..1a1db622209 --- /dev/null +++ b/website/docs/community/spotlight/josh-devlin.md @@ -0,0 +1,39 @@ +--- +id: josh-devlin +title: Josh Devlin +description: | + After "discovering" dbt in early 2020, I joined the community and used it as a learning tool while I tried to get dbt introduced at my company. By helping others, I learned about common pitfalls, best practices, and the breadth of the tool. When it came time to implement it months later, I already felt like an expert! + + In December 2020 I attended the first virtual Coalesce conference, attending all 4 days across 3 time zones! I found my quirky-nerdy-purple-people, and felt at home. + + 3 years later I had the pleasure of presenting at my first dbt Meetup in Sydney, and then at the first in-person Coalesce in New Orleans. My passion is helping people, and I'm glad that the dbt community gives me a place to do that! 
+image: /img/community/spotlight/josh-devlin.jpg +pronouns: he/him +location: Melbourne, Australia (but spent most of the last decade in Houston, USA) +jobTitle: Senior Analytics Engineer +companyName: Canva +organization: "" +socialLinks: + - name: Twitter + link: https://twitter.com/JayPeeDevlin + - name: LinkedIn + link: https://www.linkedin.com/in/josh-devlin/ +dateCreated: 2023-06-27 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I have been a subscriber to 'The Data Science Roundup' (now ['The Analytics Engineering Roundup'](https://roundup.getdbt.com/)) since its inception, so I knew that dbt existed from the very beginning, since the time that dbt Labs was still called Fishtown Analytics. Despite that, I never really understood what the tool was or how it fit in until early 2020 when I first started experimenting with the tool. I immediately joined the community and found it warm and welcoming, so I started to help people where I could and never stopped! + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I like to think I represent the warm, helpful vibes of the early days of the Community, where folks like Claire Carroll warmly welcomed myself and others! + +## What have you learned from community members? What do you hope others can learn from you? + +I've learned that the more you give, the more you get. I've put hundreds of hours into helping other people in the community, but I've gotten all that back and much more. I hope I can encourage others to give of themselves and reap the rewards later! + +## Anything else interesting you want to tell us? + +In a previous life I was an orchestral musician! 
diff --git a/website/docs/community/spotlight/karen-hsieh.md b/website/docs/community/spotlight/karen-hsieh.md new file mode 100644 index 00000000000..1a5cc8c4788 --- /dev/null +++ b/website/docs/community/spotlight/karen-hsieh.md @@ -0,0 +1,63 @@ +--- +id: karen-hsieh +title: Karen Hsieh +description: | + I’m a Product Manager who builds company-wide data literacy and empowers the product team to create values for people and grow the company. + + Utilizing dbt, I replaced time-consuming spreadsheets by creating key business metric dashboards that improved data literacy, enabling conversations about product and business. + + Since joining the dbt community in 2019, I’ve led the creation of the #local-taiwan dbt Slack channel, organized 10 Taipei dbt Meetups and spoken at Coalesce 2022. + + I write about how data empowers products on Medium. I focus on understanding how users utilize and think about the product based on facts. +image: /img/community/spotlight/karen-hsieh.jpg +pronouns: she/her +location: Taipei, Taiwan +jobTitle: Director of Product & Data +companyName: ALPHA Camp +organization: "" +socialLinks: + - name: Twitter + link: https://twitter.com/ijac_wei + - name: LinkedIn + link: https://www.linkedin.com/in/karenhsieh/ + - name: Medium + link: https://medium.com/@ijacwei +dateCreated: 2023-03-24 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +As a Product Manager with a passion for data, I began using dbt in 2019 when it was introduced to me by Richard Lee, CTO & co-founder of [iCook.tw](http://icook.tw/). We were likely one of the first companies in Taiwan to use dbt and as the sole dbt model writer in our startup, I worked with BigQuery, dbt plus Metabase, with the occasional use of Data Studio (now Looker Studio). + +I joined the dbt Slack community to connect with others who use dbt and found the forum to be more informative than formal documentation. 
Viewing that documentation is formal. Viewing the questions from people is real! I love the sense of community that is built around building things together. + +dbt demonstrates how I want to build products—by creating a direct connection between the users and those who build the product. I conduct user interviews in a style that resembles chatting with friends who share similar interests. This approach encourages me to get to know users more directly. + +In January 2022, Laurence Chen (REPLWARE) and a friend asked me if I knew anyone else in Taiwan using dbt, which inspired me to request and maintain a local channel for the Taiwan community in the dbt Slack, #local-taipei. I had no prior experience with community-building and wasn't an engineer, but I searched the #introductions channel and found Allen Wang (Zitara Technologies). With him and Laurence, we got started on organizing our first meetup in Taiwan. After running a few successful meetups on our own, we got in touch with dbt Labs, and are now the official Taiwan dbt Meetup organizers. We have now run 10 meetups in total, with two more planned in May and June 2023. We've never actively sought to increase membership, but the meetups have continued to grow and attract passionate people who are always willing to share their experiences. (See more about the story [here](https://medium.com/dbt-local-taiwan/how-does-dbt-local-taipei-get-started-ff58489c80fa).) + +Through dbt, I've learned that people are friendly and willing to help when asked. Publicly asking questions is a great way to practice describing questions more effectively and simplifying them for others to understand, a skill that has come in handy for solving problems beyond those related to data. + +I've also developed a support system where I know where to go for help with various questions, such as data, leadership, product, business and hiring. It all started with the dbt community 💜.
+ +What’s more, Laurence pushed me to introduce dbt at [COSCUP Taiwan 2022](https://coscup.org/2022/zh-TW/session/SRKVLQ), the largest open-source event in Taiwan, and I was thrilled to be accepted to speak at Coalesce 2022. Preparing talks and answering questions were valuable training, and with the help of many people, I revised my presentation multiple times, refining my opinions and structuring my thinking. I learned a lot from these experiences. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +I admire dbt Labs’ co-founder Tristan Handy’s openness to chat and share his thoughts, as well as his open-minded approach to product-building and his mission to encourage data analysts to work more like software developers. When I spoke with dbt employees at Coalesce 2022, it was clear that they aligned with the company mission and loved the product that they worked on. + +I am also inspired by Benn Stancil's (Mode) ability to [write a newsletter every Friday](https://benn.substack.com/) that uncovers questions and trends in a clear and concise way, sparking discussions and inspiring people. While attending Coalesce annually from 2020 to 2022, I found Emilie Schario’s (co-founder, Turbine) presentations enlightening and I could see the evolution of how data teams work over the course of her three successive talks. + +Additionally, I am inspired by Jolanda Zwagemaker (Wise), who created #local-singapore to connect with others when she moved from London to Singapore and even flew to Taiwan to share her experiences with Incremental. + +I never think about leadership in the community. I want to share my thoughts, ideas, and experiences to inspire others and start discussions. I also hope to encourage more people in Taiwan to join conversations with others from around the world, as I believe this will help us learn and grow together. + +## What have you learned from community members?
What do you hope others can learn from you? + +Being a part of the dbt community has taught me the value of collaboration and learning from others. As someone who started using dbt as a solo PM, it's been reassuring to find that I'm not alone. It's inspiring to see people from diverse backgrounds, including those without engineering experience, eager to learn and become analytics engineers. It’s incredible to see the progress occurring in people’s skills and even in their careers. + +I hope that my journey from learning SQL in 2019 to leading a small data team now can inspire others. I believe that sharing knowledge and asking questions are great ways to learn and grow. I want others to know that it's okay not to have all the answers and that their experiences are valuable. Everyone has a unique perspective and experience that can help others. + +## Anything else interesting you want to tell us? + +Thank you 💕. I gain a lot from the community and dbt. diff --git a/website/docs/community/spotlight/owen-prough.md b/website/docs/community/spotlight/owen-prough.md new file mode 100644 index 00000000000..cc8ce37221e --- /dev/null +++ b/website/docs/community/spotlight/owen-prough.md @@ -0,0 +1,41 @@ +--- +id: owen-prough +title: Owen Prough +description: | + Well met, data adventurer! My professional data history is mostly USA healthcare-related (shout out to ANSI X12 claim files) while working with large (10k+ employee) software companies and small (but growing!) startups. My constant companion for the last decade has been SQL of various flavors https://xkcd.com/927/, and these days I mostly work with PostgreSQL, AWS Athena, and Snowflake. I think SQL is a great tool to solve interesting problems. + + Oh and also dbt. I haven't done anything too fancy with dbt, but I have contributed to the dbt-athena adapter and a few different packages. Mostly I lurk on Slack, cleverly disguised as a duck. It's a professional goal of mine to someday attend Coalesce. 
+image: /img/community/spotlight/owen-prough.jpg +pronouns: he/him +location: Milwaukee, USA +jobTitle: Data Engineer +companyName: Sift Healthcare +organization: "" +socialLinks: + - name: LinkedIn + link: https://linkedin.com/in/owen-prough +dateCreated: 2023-06-28 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I committed dbt_project.yml to the company git repo in July 2021 so I've been hanging out with all of you for about 2 years. What I love the most about dbt is how easy it is to write data tests. Writing data tests without dbt was painful, but now with all the tests we have in dbt I have a dramatically improved confidence in our data quality. + +The wider dbt community is also a reliable and constant source of education. I only interact in a few Slack channels, but I read *many* Slack channels to see what others are doing in the Analytics Engineering space and to get ideas about how to improve the processes/pipelines at my company. Y'all are great. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +This is an interesting question. I think I most identify with or am inspired by [Josh Devlin](./josh-devlin), who seems to be everywhere on Slack and very knowledgeable/helpful. I also want to know things and pay it forward. + +Also shout out to [Faith Lierheimer](./faith-lierheimer), whose contributions to [#roast-my-graph](https://www.getdbt.com/community/join-the-community/?utm_medium=internal&utm_source=docs&utm_campaign=q3-2024_dbt-spotlight_aw&utm_content=____&utm_term=all___) always make me laugh and/or weep. + +## What have you learned from community members? What do you hope others can learn from you? + +The [public documentation for dbt](https://docs.getdbt.com/docs/introduction) is quite good. You should bookmark it and make it a personal goal to read through it all. 
There are a lot of cool things that dbt can do. + +Also I think it's really cool to see newcomers asking questions on Slack/[Discourse](https://discourse.getdbt.com/) and then see those same people answering others' questions. It speaks to the value we all get from dbt that folks want to give back to the community. + +## Anything else interesting you want to tell us? + +Did you notice how I avoided starting a sentence with "dbt"? That's because I know the standard is lowercase, but starting a sentence with a lowercase word looks weird to my eyes. diff --git a/website/docs/community/spotlight/shinya-takimoto.md b/website/docs/community/spotlight/shinya-takimoto.md new file mode 100644 index 00000000000..20d0954ee2f --- /dev/null +++ b/website/docs/community/spotlight/shinya-takimoto.md @@ -0,0 +1,41 @@ +--- +id: shinya-takimoto +title: Shinya Takimoto +description: | + I have about 3 years of dbt experience. I used to be in a large organization where the challenge was to create a quality analysis infrastructure for EC data managed by my department with a limited number of staff. It was then that I learned about dbt and I still remember the shock I felt when I ran a dbt run for the first time. + + Currently, I work for a startup called 10X. We provide a system that allows retailers to seamlessly launch online grocery services in an O2O model. + + I am also actively involved in dbt Community activities, starting the #local-tokyo channel in dbt Slack, organizing the Tokyo dbt Meetup event and writing translations of dbt-related articles. In addition, I run a podcast called ModernDataStackRadio. +image: /img/community/spotlight/shinya-takimoto.jpg +pronouns: he/him +location: Tokyo, Japan +jobTitle: Analytics Engineer +companyName: 10X, Inc. 
+organization: Data Build Japan +socialLinks: + - name: Twitter + link: https://twitter.com/takimo + - name: LinkedIn + link: https://www.linkedin.com/in/shinya-takimoto-2793483a/ + - name: Website + link: https://takimo.tokyo/ +dateCreated: 2023-04-03 +hide_table_of_contents: true +--- + +## When did you join the dbt community and in what way has it impacted your career? + +I joined dbt Slack in late 2021 and led the creation of the #local-tokyo channel soon after, which I still maintain. Community activities gave me the opportunity to meet more dbt users. By sharing the knowledge and insights I had gained in my own company, I was able to connect with people who were struggling with the same issues and difficulties. The shared sense of common data usage challenges has led to networking and recognition of individuals and the companies they work for, which has increased my opportunities to work with many people. + +## What dbt community leader do you identify with? How are you looking to grow your leadership in the dbt community? + +We have been working to create a place in Japan where people who feel the same way about the potential of dbt can interact with each other. Now we have almost 400 members. We would like to support the creation of an environment where more information can be shared through articles and discussions, whereby the focus can be on companies and players who are working on advanced projects. Thereby, we can increase the number of connections among dbt users. + +## What have you learned from community members? What do you hope others can learn from you? + +Many members from enterprise companies to start-ups with various business sizes and a wide variety of business activities have joined the #local-tokyo channel in dbt Slack. + +Therefore, ideas and knowledge about data modeling and testing differ from one business domain to another. 
I believe that they provide the local community with a variety of new insights and perspectives that are surprising as well as curious. + +As a company that uses dbt in many production environments myself, I hope to share a lot of knowledge with the dbt Community. diff --git a/website/docs/dbt-cli/cli-overview.md b/website/docs/dbt-cli/cli-overview.md index 7a911a7eb95..3b96d4637bd 100644 --- a/website/docs/dbt-cli/cli-overview.md +++ b/website/docs/dbt-cli/cli-overview.md @@ -10,7 +10,7 @@ When using the command line, you can run commands and do other work from the cur -Once you verify your dbt project is your working directory, you can execute dbt commands. A full list of dbt commands can be found in the [reference section](dbt-commands). +Once you verify your dbt project is your working directory, you can execute dbt commands. A full list of dbt commands can be found in the [reference section](/reference/dbt-commands). diff --git a/website/docs/docs/about-setup.md b/website/docs/docs/about-setup.md new file mode 100644 index 00000000000..ceb34a5ccbb --- /dev/null +++ b/website/docs/docs/about-setup.md @@ -0,0 +1,34 @@ +--- +title: About dbt setup +id: about-setup +description: "About setup of dbt Core and Cloud" +sidebar_label: "About dbt setup" +pagination_next: "docs/environments-in-dbt" +pagination_prev: null +--- + +dbt compiles and runs your analytics code against your data platform, enabling you and your team to collaborate on a single source of truth for metrics, insights, and business definitions. There are two options for deploying dbt: + +**dbt Cloud** runs dbt Core in a hosted (single or multi-tenant) environment with a browser-based interface. The intuitive user interface aids you in setting up the various components. dbt Cloud comes equipped with turnkey support for scheduling jobs, CI/CD, hosting documentation, monitoring, and alerting. 
It also offers an integrated development environment (IDE) and allows you to develop and run dbt commands from your local command line (CLI) or code editor. + +**dbt Core** is an open-source command line tool that can be installed locally in your environment, and communication with databases is facilitated through adapters. + +If you're not sure which is the right solution for you, read our [What is dbt?](/docs/introduction) and our [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features) articles to help you decide. If you still have questions, don't hesitate to [contact us](https://www.getdbt.com/contact/). + +To begin configuring dbt now, select the option that is right for you. + +
+ + + + + +
diff --git a/website/docs/docs/about/overview.md b/website/docs/docs/about/overview.md index f4ab1e6396b..e34866fa3fe 100644 --- a/website/docs/docs/about/overview.md +++ b/website/docs/docs/about/overview.md @@ -1,10 +1,8 @@ --- -title: "Overview" +title: "What is dbt? " id: "overview" --- -# What is dbt? - dbt is a productivity tool that helps analysts get more done and produce higher quality results. Analysts commonly spend 50-80% of their time modeling raw data—cleaning, reshaping, and applying fundamental business logic to it. dbt empowers analysts to do this work better and faster. @@ -23,7 +21,7 @@ To test a data model, an analyst asserts something to be true about the underlyi One dbt user has this to say: *“At this point when I have a new question, I can answer it 10-100x faster than I could before.”* Here’s how: -- dbt allows analysts avoid writing boilerplate and : managing transactions, dropping tables, and managing schema changes. All business logic is expressed in SQL `SELECT` statements, and dbt takes care of . +- dbt allows analysts to avoid writing boilerplate and : managing transactions, dropping tables, and managing schema changes. All business logic is expressed in SQL `SELECT` statements, and dbt takes care of . - dbt creates leverage. Instead of starting at the raw data with every analysis, analysts instead build up reusable data models that can be referenced in subsequent work. - dbt includes optimizations for data model materialization, allowing analysts to dramatically reduce the time their queries take to run. @@ -40,7 +38,7 @@ It does. Here’s how: # Why SQL? -While there are a large number of great languages for manipulating data, we’ve chosen SQL as the primary data transformation language at the heart of dbt. 
There are three reasons for this: +While there are a large number of great languages for manipulating data, we’ve chosen SQL as the primary [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) language at the heart of dbt. There are three reasons for this: 1. SQL is a very widely-known language for working with data. Using SQL gives the largest-possible group of users access. 2. Modern analytic databases are extremely performant and have sophisticated optimizers. Writing data transformations in SQL allows users to describe transformations on their data but leave the execution plan to the underlying database technology. In practice, this provides excellent results with far less work on the part of the author. @@ -48,8 +46,8 @@ While there are a large number of great languages for manipulating data, we’ve # What databases does dbt currently support? -See [Supported Data Platforms](supported-data-platforms) to view the full list of supported databases, warehouses, and query engines. +See [Supported Data Platforms](/docs/supported-data-platforms) to view the full list of supported databases, warehouses, and query engines. # How do I get started? -dbt is open source and completely free to download and use. See our [Getting Started guide](introduction) for more. +dbt is open source and completely free to download and use. See our [Getting Started guide](/docs/introduction) for more. 
diff --git a/website/docs/docs/build/about-metricflow.md b/website/docs/docs/build/about-metricflow.md new file mode 100644 index 00000000000..4910f12a792 --- /dev/null +++ b/website/docs/docs/build/about-metricflow.md @@ -0,0 +1,304 @@ +--- +title: "About MetricFlow" +id: about-metricflow +description: "Learn more about MetricFlow and its key concepts" +sidebar_label: About MetricFlow +tags: [Metrics, Semantic Layer] +pagination_next: "docs/build/join-logic" +pagination_prev: null +--- + +This guide introduces MetricFlow's fundamental ideas for people new to this feature. MetricFlow, which powers the dbt Semantic Layer, helps you define and manage the logic for your company's metrics. It's an opinionated set of abstractions and helps data consumers retrieve metric datasets from a data platform quickly and efficiently. + +MetricFlow handles SQL query construction and defines the specification for dbt semantic models and metrics. It allows you to define metrics in your dbt project and query them with [MetricFlow commands](/docs/build/metricflow-commands) whether in dbt Cloud or dbt Core. + +Before you start, consider the following guidelines: + +- Define metrics in YAML and query them using these [new metric specifications](https://github.com/dbt-labs/dbt-core/discussions/7456). +- You must be on [dbt version](/docs/dbt-versions/upgrade-core-in-cloud) 1.6 or higher to use MetricFlow. +- Use MetricFlow with Snowflake, BigQuery, Databricks, Postgres (dbt Core only), or Redshift. +- Discover insights and query your metrics using the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and its diverse range of [available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations). You must have a dbt Cloud account on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). + +## MetricFlow + +MetricFlow is a SQL query generation tool designed to streamline metric creation across different data dimensions for diverse business needs. 
+- It operates through YAML files, where a semantic graph links language to data. This graph comprises [semantic models](/docs/build/semantic-models) (data entry points) and [metrics](/docs/build/metrics-overview) (functions for creating quantitative indicators). +- MetricFlow is a [BSL package](https://github.com/dbt-labs/metricflow) with code source available, and compatible with dbt version 1.6 and higher. Data practitioners and enthusiasts are highly encouraged to contribute. +- As a part of the dbt Semantic Layer, MetricFlow empowers organizations to define metrics using YAML abstractions. +- To query metric dimensions, dimension values, and validate configurations, use [MetricFlow commands](/docs/build/metricflow-commands). + + +**Note** — MetricFlow doesn't support dbt [builtin functions or packages](/reference/dbt-jinja-functions/builtins) at this time, however, support is planned for the future. + +MetricFlow abides by these principles: + +- **Flexibility with completeness**: Define metric logic using flexible abstractions on any data model. +- **DRY (Don't Repeat Yourself)**: Minimize redundancy by enabling metric definitions whenever possible. +- **Simplicity with gradual complexity:** Approach MetricFlow using familiar data modeling concepts. +- **Performance and efficiency**: Optimize performance while supporting centralized data engineering and distributed logic ownership. + + + +### Semantic graph + +We're introducing a new concept: a "semantic graph". It's the relationship between semantic models and YAML configurations that creates a data landscape for building metrics. You can think of it like a map, where tables are like locations, and the connections between them (edges) are like roads. Although it's under the hood, the semantic graph is a subset of the , and you can see the semantic models as nodes on the DAG. + +The semantic graph helps us decide which information is available to use for consumption and which is not. 
The connections between tables in the semantic graph are more about relationships between the information. This is different from the DAG, where the connections show dependencies between tasks. + +When MetricFlow generates a metric, it uses its SQL engine to figure out the best path between tables using the framework defined in YAML files for semantic models and metrics. When these models and metrics are correctly defined, they can be used downstream with dbt Semantic Layer's integrations. + +### Semantic models + +Semantic models are the starting points of data and correspond to models in your dbt project. You can create multiple semantic models from each model. Semantic models have metadata, like a data table, that define important information such as the table name and primary keys for the graph to be navigated correctly. + +For a semantic model, there are three main pieces of metadata: + +* [Entities](/docs/build/entities) — The join keys of your semantic model (think of these as the traversal paths, or edges between semantic models). +* [Dimensions](/docs/build/dimensions) — These are the ways you want to group or slice/dice your metrics. +* [Measures](/docs/build/measures) — The aggregation functions that give you a numeric result and can be used to create your metrics. + +### Metrics + +Metrics, which is a key concept, are functions that combine measures, constraints, or other mathematical functions to define new quantitative indicators. MetricFlow uses measures and various aggregation types, such as average, sum, and count distinct, to create metrics. Dimensions add context to metrics and without them, a metric is simply a number for all time. You can define metrics in the same YAML files as your semantic models, or create a new file. + +MetricFlow supports different metric types: + +- [Cumulative](/docs/build/cumulative) — Aggregates a measure over a given window. 
+- [Derived](/docs/build/derived) — An expression of other metrics, which allows you to do calculations on top of metrics. +- [Ratio](/docs/build/ratio) — Create a ratio out of two measures, like revenue per customer. +- [Simple](/docs/build/simple) — Metrics that refer directly to one measure. + +## Use case + +In the upcoming sections, we'll show how data practitioners currently calculate metrics and compare it to how MetricFlow makes defining metrics easier and more flexible. + +The following example data is based on the Jaffle Shop repo. You can view the complete [dbt project](https://github.com/dbt-labs/jaffle-sl-template). The tables we're using in our example model are: + +- `orders` is a production data platform export that has been cleaned up and organized for analytical consumption +- `customers` is a partially denormalized table in this case with a column derived from the orders table through some upstream process + + + +To make this more concrete, consider the metric `order_total`, which is defined using the SQL expression: + +`select sum(order_total) as order_total from orders` +This expression calculates the revenue from each order by summing the order_total column in the orders table. In a business setting, the metric order_total is often calculated according to different categories, such as" +- Time, for example `date_trunc(ordered_at, 'day')` +- Order Type, using `is_food_order` dimension from the `orders` table. + +### Calculate metrics + +Next, we'll compare how data practitioners currently calculate metrics with multiple queries versus how MetricFlow simplifies and streamlines the process. + + + + +The following example displays how data practitioners typically would calculate the `order_total` metric aggregated. It's also likely that analysts are asked for more details on a metric, like how much revenue came from new customers. 
+ +Using the following query creates a situation where multiple analysts working on the same data, each using their own query method — this can lead to confusion, inconsistencies, and a headache for data management. + +```sql +select + date_trunc('day',orders.ordered_at) as day, + case when customers.first_ordered_at is not null then true else false end as is_new_customer, + sum(orders.order_total) as order_total +from + orders +left join + customers +on + orders.customer_id = customers.customer_id +group by 1, 2 +``` + + + + +In the following three example tabs, use MetricFlow to define a semantic model that uses order_total as a metric and a sample schema to create consistent and accurate results — eliminating confusion, code duplication, and streamlining your workflow. + + + + +In this example, a measure named `order_total` is defined based on the order_total column in the `orders` table. + +The time dimension `metric_time` provides daily granularity and can be aggregated into weekly or monthly time periods. Additionally, a categorical dimension called `is_new_customer` is specified in the `customers` semantic model. + + +```yaml +semantic_models: + - name: orders # The name of the semantic model + description: | + A model containing order data. The grain of the table is the order id. + model: ref('orders') #The name of the dbt model and schema + defaults: + agg_time_dimension: metric_time + entities: # Entities, which usually correspond to keys in the table. + - name: order_id + type: primary + - name: customer + type: foreign + expr: customer_id + measures: # Measures, which are the aggregations on the columns in the table. + - name: order_total + agg: sum + dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension. 
+ - name: metric_time + expr: cast(ordered_at as date) + type: time + type_params: + time_granularity: day + - name: customers # The name of the second semantic model + description: > + Customer dimension table. The grain of the table is one row per + customer. + model: ref('customers') #The name of the dbt model and schema + defaults: + agg_time_dimension: first_ordered_at + entities: # Entities, which usually correspond to keys in the table. + - name: customer + type: primary + expr: customer_id + dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension. + - name: is_new_customer + type: categorical + expr: case when first_ordered_at is not null then true else false end + - name: first_ordered_at + type: time + type_params: + time_granularity: day + + ``` + + + + +Similarly, you could then add additional dimensions like `is_food_order` to your semantic models to incorporate even more dimensions to slice and dice your revenue order_total. + +```yaml +semantic_models: + - name: orders + description: | + A model containing order data. The grain of the table is the order id. + model: ref('orders') #The name of the dbt model and schema + defaults: + agg_time_dimension: metric_time + entities: # Entities, which usually correspond to keys in the table + - name: order_id + type: primary + - name: customer + type: foreign + expr: customer_id + measures: # Measures, which are the aggregations on the columns in the table. + - name: order_total + agg: sum + dimensions: # Dimensions are either categorical or time. They add additional context to metrics and the typical querying pattern is Metric by Dimension. 
+ - name: metric_time + expr: cast(ordered_at as date) + type: time + type_params: + time_granularity: day + - name: is_food_order + type: categorical +``` + + + +Imagine an even more complex metric is needed, like the amount of money earned each day from food orders from returning customers. Without MetricFlow the data practitioner's original SQL might look like this: + +```sql +select + date_trunc('day',orders.ordered_at) as day, + sum(case when is_food_order = true then order_total else null end) as food_order, + sum(orders.order_total) as sum_order_total, + food_order/sum_order_total +from + orders +left join + customers +on + orders.customer_id = customers.customer_id +where + case when customers.first_ordered_at is not null then true else false end = true +group by 1 +``` + +MetricFlow simplifies the SQL process via metric YAML configurations as seen below. You can also commit them to your git repository to ensure everyone on the data and business teams can see and approve them as the true and only source of information. + +```yaml +metrics: + - name: food_order_pct_of_order_total + description: Revenue from food orders in each store + label: "Food % of Order Total" + type: ratio + type_params: + numerator: food_order + denominator: active_customers + filter: | + {{ Dimension('customer__is_new_customer')}} = true +``` + + + + + + +## FAQs + +
+ Do my datasets need to be normalized? +
+
Not at all! While a cleaned and well-modeled data set can be extraordinarily powerful and is the ideal input, you can use any dataset from raw to fully denormalized datasets.

It's recommended that you apply data consistency practices, such as filtering bad data, normalizing common objects, and data modeling of keys and tables, in upstream applications. The Semantic Layer is more efficient at denormalizing data than at normalizing it.

If you have not invested in data consistency, that is okay. The Semantic Layer can take SQL queries or expressions to define consistent datasets.
+
+
+
+ Why is normalized data the ideal input? +
+
MetricFlow is built to do denormalization efficiently. There are better tools to take raw datasets and accomplish the various tasks required to build data consistency and organized data models. On the other hand, by putting in denormalized data you are potentially creating redundancy, which is technically challenging to manage, and you are reducing the potential granularity that MetricFlow can use to aggregate metrics.
+
+
+
+ Why not just make metrics the same as measures? +
+
One principle of MetricFlow is to reduce the duplication of logic, sometimes referred to as Don't Repeat Yourself (DRY).

Many metrics are constructed from reused measures and in some cases constructed from measures from different semantic models. This allows for metrics to be built breadth-first (metrics that can stand alone) instead of depth-first (where you have multiple metrics acting as functions of each other).

Additionally, not all metrics are constructed off of measures. As an example, a conversion metric is likely defined as the presence or absence of an event record after some other event record.
+
+
+
+ How does the Semantic Layer handle joins? +
+
MetricFlow builds joins based on the types of keys and parameters that are passed to entities. To better understand how joins are constructed, see our documentation on join types.

Rather than capturing arbitrary join logic, MetricFlow captures the types of each identifier and then helps the user to navigate to appropriate joins. This allows us to avoid the construction of fan out and chasm joins as well as generate legible SQL.
+
+
+
+ Are entities and join keys the same thing? +
+
If it helps you to think of entities as join keys, that is very reasonable. Entities in MetricFlow have applications beyond joining two tables, such as acting as a dimension.
+
+
+
+ Can a table without a primary or unique entity have dimensions?
+
Yes, but because a dimension is considered an attribute of the primary or unique entity of the table, it is only usable by the metrics that are defined in that table. It cannot be joined to metrics from other tables. This is common in event logs.
+
+
+ + +## Related docs +- [Joins](/docs/build/join-logic) +- [Validations](/docs/build/validation) + diff --git a/website/docs/docs/build/analyses.md b/website/docs/docs/build/analyses.md index 2368c9b817d..74b138ac67a 100644 --- a/website/docs/docs/build/analyses.md +++ b/website/docs/docs/build/analyses.md @@ -1,6 +1,8 @@ --- title: "Analyses" +description: "Read this tutorial to learn how to use custom analyses when building in dbt." id: "analyses" +pagination_next: null --- ## Overview @@ -11,10 +13,10 @@ Any `.sql` files found in the `analyses/` directory of a dbt project will be com In practice, an analysis file might look like this (via the [open source Quickbooks models](https://github.com/dbt-labs/quickbooks)): - + ```sql --- analysis/running_total_by_account.sql +-- analyses/running_total_by_account.sql with journal_entries as ( diff --git a/website/docs/docs/build/build-metrics-intro.md b/website/docs/docs/build/build-metrics-intro.md new file mode 100644 index 00000000000..cdac51224ed --- /dev/null +++ b/website/docs/docs/build/build-metrics-intro.md @@ -0,0 +1,69 @@ +--- +title: "Build your metrics" +id: build-metrics-intro +description: "Learn about MetricFlow and build your metrics with semantic models" +sidebar_label: Build your metrics +tags: [Metrics, Semantic Layer, Governance] +hide_table_of_contents: true +pagination_next: "docs/build/sl-getting-started" +pagination_prev: null +--- + +Use MetricFlow in dbt to centrally define your metrics. As a key component of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), MetricFlow is responsible for SQL query construction and defining specifications for dbt semantic models and metrics. It uses familiar constructs like semantic models and metrics to avoid duplicative coding, optimize your development workflow, ensure data governance for company metrics, and guarantee consistency for data consumers. 
+ + +MetricFlow allows you to: +- Intuitively define metrics in your dbt project +- Develop from your preferred environment, whether that's the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud), or [dbt Core](/docs/core/installation) +- Use [MetricFlow commands](/docs/build/metricflow-commands) to query and test those metrics in your development environment +- Harness the true magic of the universal dbt Semantic Layer and dynamically query these metrics in downstream tools (Available for dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) accounts only). + + +
+ + + + + + + + + + + + + + +

+ + +## Related docs + +- [The dbt Semantic Layer: what's next](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/) blog +- [Get started with MetricFlow](/docs/build/sl-getting-started) + + diff --git a/website/docs/docs/build/cumulative-metrics.md b/website/docs/docs/build/cumulative-metrics.md new file mode 100644 index 00000000000..708045c1f3e --- /dev/null +++ b/website/docs/docs/build/cumulative-metrics.md @@ -0,0 +1,249 @@ +--- +title: "Cumulative metrics" +id: cumulative +description: "Use Cumulative metrics to aggregate a measure over a given window." +sidebar_label: Cumulative +tags: [Metrics, Semantic Layer] +--- + +Cumulative metrics aggregate a measure over a given accumulation window. If no window is specified, the window is considered infinite and accumulates values over all time. You will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics. + +This metric is common for calculating things like weekly active users, or month-to-date revenue. The parameters, description, and type for cumulative metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `measure` | The measure you are referencing. | Required | +| `window` | The accumulation window, such as 1 month, 7 days, 1 year. This can't be used with `grain_to_date`. | Optional | +| `grain_to_date` | Sets the accumulation grain, such as month will accumulate data for one month. Then restart at the beginning of the next. This can't be used with `window`. 
| Optional | + +The following displays the complete specification for cumulative metrics, along with an example: + +```yaml +metrics: + - name: The metric name # Required + description: The metric description # Optional + type: cumulative # Required + label: The value that will be displayed in downstream tools # Required + type_params: # Required + measure: The measure you are referencing # Required + window: The accumulation window, such as 1 month, 7 days, 1 year. # Optional. Cannot be used with grain_to_date. + grain_to_date: Sets the accumulation grain, such as month will accumulate data for one month, then restart at the beginning of the next. # Optional. Cannot be used with window. + +``` + +## Limitations +Cumulative metrics are currently under active development and have the following limitations: + +1. You can only use the [`metric_time` dimension](/docs/build/dimensions#time) to check cumulative metrics. If you don't use `metric_time` in the query, the cumulative metric will return incorrect results because it won't perform the time spine join. This means you cannot reference time dimensions other than the `metric_time` in the query. +2. If you use `metric_time` in your query filter but don't include "start_time" and "end_time," cumulative metrics will left-censor the input data. For example, if you query a cumulative metric with a 7-day window with the filter `{{ TimeDimension('metric_time') }} BETWEEN '2023-08-15' AND '2023-08-30' `, the values for `2023-08-15` to `2023-08-20` return missing or incomplete data. This is because we apply the `metric_time` filter to the aggregation input. To avoid this, you must use `start_time` and `end_time` in the query filter. + + +## Cumulative metrics example + + +:::tip MetricFlow time spine required + +You will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics. 
+ +::: + +Cumulative metrics measure data over a given window and consider the window infinite when no window parameter is passed, accumulating the data over all time. + +```yaml + +metrics: + - name: cumulative_order_total + label: Cumulative Order total (All Time) + description: The cumulative value of all orders + type: cumulative + type_params: + measure: order_total + - name: cumulative_order_total_l1m + label: Cumulative Order total (L1M) + description: Trailing 1 month cumulative order amount + type: cumulative + type_params: + measure: order_total + window: 1 month + - name: cumulative_order_total_mtd + label: Cumulative Order total (MTD) + description: The month to date value of all orders + type: cumulative + type_params: + measure: order_total + grain_to_date: month +``` + +### Window options + +This section details examples of when you specify and don't specify window options. + + + + + +If a window option is specified, the MetricFlow framework applies a sliding window to the underlying measure. + +Suppose the underlying measure `customers` is configured to count the unique customers making orders at the Jaffle shop. + +```yaml +measures: + - name: customers + expr: customer_id + agg: count_distinct + +``` + +We can write a cumulative metric `weekly_customers` as such: + +``` yaml +metrics: + - name: weekly_customers # Define the measure and the window. + type: cumulative + type_params: + measure: customers + window: 7 days # Setting the window to 7 days since we want to track weekly active +``` + +From the sample YAML above, note the following: + +* `type`: Specify cumulative to indicate the type of metric. +* `type_params`: Specify the measure you want to aggregate as a cumulative metric. You have the option of specifying a `window`, or a `grain to date`. + +For example, in the `weekly_customers` cumulative metric, MetricFlow takes a sliding 7-day window of relevant customers and applies a count distinct function. 
+ +If you omit the `window`, the measure will accumulate over all time. Otherwise, you can choose from granularities like day, week, quarter, or month, and describe the window using phrases like "7 days" or "1 month." + + + + + +You can use cumulative metrics without a window specified to obtain a running total. Suppose you have a log table with columns like: + +Suppose you (a subscription-based company for the sake of this example) have an event-based log table with the following columns: + +* `date`: a date column +* `user_id`: (integer) an ID specified for each user that is responsible for the event +* `subscription_plan`: (integer) a column that indicates a particular subscription plan associated with the user. +* `subscription_revenue`: (integer) a column that indicates the value associated with the subscription plan. +* `event_type`: (integer) a column that populates with +1 to indicate an added subscription, or -1 to indicate a deleted subscription. +* `revenue`: (integer) a column that multiplies `event_type` and `subscription_revenue` to depict the amount of revenue added or lost for a specific date. + +Using cumulative metrics without specifying a window, you can calculate running totals for metrics like the count of active subscriptions and revenue at any point in time. 
The following configuration YAML displays creating such cumulative metrics to obtain current revenue or the total number of active subscriptions as a cumulative sum: + +```yaml +measures: + - name: revenue + description: Total revenue + agg: sum + expr: revenue + - name: subscription_count + description: Count of active subscriptions + agg: sum + expr: event_type +metrics: + - name: current_revenue + description: Current revenue + label: Current Revenue + type: cumulative + type_params: + measure: revenue + - name: active_subscriptions + description: Count of active subscriptions + label: Active Subscriptions + type: cumulative + type_params: + measure: subscription_count + +``` + + + + + +### Grain to date + +You can choose to specify a grain to date in your cumulative metric configuration to accumulate a metric from the start of a grain (such as week, month, or year). When using a window, such as a month, MetricFlow will go back one full calendar month. However, grain to date will always start accumulating from the beginning of the grain, regardless of the latest date of data. + +For example, let's consider an underlying measure of `order_total.` + +```yaml + measures: + - name: order_total + agg: sum +``` + +We can compare the difference between a 1-month window and a monthly grain to date. The cumulative metric in a window approach applies a sliding window of 1 month, whereas the grain to date by month resets at the beginning of each month. 
+ +```yaml +metrics: + - name: cumulative_order_total_l1m #For this metric, we use a window of 1 month + label: Cumulative Order total (L1M) + description: Trailing 1 month cumulative order amount + type: cumulative + type_params: + measure: order_total + window: 1 month + - name: cumulative_order_total_mtd #For this metric, we use a monthly grain to date + label: Cumulative Order total (MTD) + description: The month to date value of all orders + type: cumulative + type_params: + measure: order_total + grain_to_date: month +``` + +### Implementation + +The current method connects the metric table to a timespine table using the primary time dimension as the join key. We use the accumulation window in the join to decide whether a record should be included on a particular day. The following SQL code produced from an example cumulative metric is provided for reference: + +``` sql +select + count(distinct distinct_users) as weekly_active_users + , metric_time +from ( + select + subq_3.distinct_users as distinct_users + , subq_3.metric_time as metric_time + from ( + select + subq_2.distinct_users as distinct_users + , subq_1.metric_time as metric_time + from ( + select + metric_time + from transform_prod_schema.mf_time_spine subq_1356 + where ( + metric_time >= cast('2000-01-01' as timestamp) + ) and ( + metric_time <= cast('2040-12-31' as timestamp) + ) + ) subq_1 + inner join ( + select + distinct_users as distinct_users + , date_trunc('day', ds) as metric_time + from demo_schema.transactions transactions_src_426 + where ( + (date_trunc('day', ds)) >= cast('1999-12-26' as timestamp) + ) AND ( + (date_trunc('day', ds)) <= cast('2040-12-31' as timestamp) + ) + ) subq_2 + on + ( + subq_2.metric_time <= subq_1.metric_time + ) and ( + subq_2.metric_time > dateadd(day, -7, subq_1.metric_time) + ) + ) subq_3 +) +group by + metric_time +limit 100 +``` diff --git a/website/docs/docs/build/custom-aliases.md b/website/docs/docs/build/custom-aliases.md index 
53671291ef8..326434ea922 100644 --- a/website/docs/docs/build/custom-aliases.md +++ b/website/docs/docs/build/custom-aliases.md @@ -1,5 +1,6 @@ --- title: "Custom aliases" +description: "Read this tutorial to learn how to use custom aliases when building in dbt." id: "custom-aliases" --- @@ -59,6 +60,8 @@ To override dbt's alias name generation, create a macro named `generate_alias_na The default implementation of `generate_alias_name` simply uses the supplied `alias` config (if present) as the model alias, otherwise falling back to the model name. This implementation looks like this: + + ```jinja2 @@ -80,6 +83,45 @@ The default implementation of `generate_alias_name` simply uses the supplied `al + + + + + + +```jinja2 +{% macro generate_alias_name(custom_alias_name=none, node=none) -%} + + {%- if custom_alias_name -%} + + {{ custom_alias_name | trim }} + + {%- elif node.version -%} + + {{ return(node.name ~ "_v" ~ (node.version | replace(".", "_"))) }} + + {%- else -%} + + {{ node.name }} + + {%- endif -%} + +{%- endmacro %} + +``` + + + + + + + +### Managing different behaviors across packages + +See docs on macro `dispatch`: ["Managing different global overrides across packages"](/reference/dbt-jinja-functions/dispatch) + + + ### Caveats #### Ambiguous database identifiers @@ -114,3 +156,21 @@ Compilation Error ``` If these models should indeed have the same database identifier, you can work around this error by configuring a [custom schema](/docs/build/custom-schemas) for one of the models. + +#### Model versions + + + +New in v1.5 + + + + + +**Related documentation:** +- [Model versions](/docs/collaborate/govern/model-versions) +- [`versions`](/reference/resource-properties/versions#alias) + +By default, dbt will create versioned models with the alias `<model_name>_v<v>`, where `<v>` is that version's unique identifier. You can customize this behavior just like for non-versioned models by configuring a custom `alias` or re-implementing the `generate_alias_name` macro. 
+ + diff --git a/website/docs/docs/build/custom-databases.md b/website/docs/docs/build/custom-databases.md index 75f9ee45d46..dd54d6998e8 100644 --- a/website/docs/docs/build/custom-databases.md +++ b/website/docs/docs/build/custom-databases.md @@ -28,7 +28,7 @@ This config changes all models in the `jaffle_shop` project to be built into a d name: jaffle_shop models: - my_project: + jaffle_shop: +database: jaffle_shop # For BigQuery users: @@ -54,8 +54,6 @@ select * from ... ### generate_database_name -New in v0.16.0 - The database name generated for a model is controlled by a macro called `generate_database_name`. This macro can be overridden in a dbt project to change how dbt generates model database names. This macro works similarly to the [generate_schema_name](/docs/build/custom-schemas#advanced-custom-schema-configuration) macro. To override dbt's database name generation, create a macro named `generate_database_name` in your own dbt project. The `generate_database_name` macro accepts two arguments: @@ -87,6 +85,14 @@ The default implementation of `generate_database_name` simply uses the supplied
+ + +### Managing different behaviors across packages + +See docs on macro `dispatch`: ["Managing different global overrides across packages"](/reference/dbt-jinja-functions/dispatch) + + + ## Considerations ### BigQuery diff --git a/website/docs/docs/build/custom-schemas.md b/website/docs/docs/build/custom-schemas.md index 7d92d727733..b20d4130725 100644 --- a/website/docs/docs/build/custom-schemas.md +++ b/website/docs/docs/build/custom-schemas.md @@ -1,9 +1,9 @@ --- title: "Custom schemas" id: "custom-schemas" +pagination_next: "docs/build/custom-databases" --- -## What is a custom schema? By default, all dbt models are built in the schema specified in your target. In dbt projects with lots of models, it may be useful to instead build some models in schemas other than your target schema – this can help logically group models together. For example, you may wish to: @@ -52,8 +52,7 @@ models: ## Understanding custom schemas -### Why does dbt concatenate the custom schema to the target schema? -When first using custom schemas, it's common to assume that a model will be built in schema that matches the `schema` configuration exactly, for example, a model that has the configuration `schema: marketing`, would be built in the `marketing` schema. However, dbt instead creates it in a schema like `_marketing` by default – there's good reason for this! +When first using custom schemas, it's common to assume that a model will be built in a schema that matches the `schema` configuration exactly, for example, a model that has the configuration `schema: marketing`, would be built in the `marketing` schema. However, dbt instead creates it in a schema like `_marketing` by default – there's a good reason for this! In a typical setup of dbt, each dbt user will use a separate target schema (see [Managing Environments](/docs/build/custom-schemas#managing-environments)). 
If dbt created models in a schema that matches a model's custom schema exactly, every dbt user would create models in the same schema. @@ -62,7 +61,10 @@ Further, the schema that your development models are built in would be the same If you prefer to use different logic for generating a schema name, you can change the way dbt generates a schema name (see below). ### How does dbt generate a model's schema name? -Under the hood, dbt uses a macro called `generate_schema_name` to determine the name of the schema that a model should be built in. The code for the macro that expresses the current logic follows: + +dbt uses a default macro called `generate_schema_name` to determine the name of the schema that a model should be built in. + +The following code represents the default macro's logic: ```sql {% macro generate_schema_name(custom_schema_name, node) -%} @@ -83,26 +85,63 @@ Under the hood, dbt uses a macro called `generate_schema_name` to determine the ## Advanced custom schema configuration +You can customize schema name generation in dbt depending on your needs, such as creating a custom macro named `generate_schema_name` in your project or using the built-in macro for environment-based schema names. The built-in macro follows a pattern of generating schema names based on the environment, making it a convenient alternative. + +If your dbt project has a macro that’s also named `generate_schema_name`, dbt will always use the macro in your dbt project instead of the default macro. + ### Changing the way dbt generates a schema name -If your dbt project includes a macro that is also named `generate_schema_name`, dbt will _always use the macro in your dbt project_ instead of the default macro. -Therefore, to change the way dbt generates a schema name, you should add a macro named `generate_schema_name` to your project, where you can then define your own logic. 
+To modify how dbt generates schema names, you should add a macro named `generate_schema_name` to your project and customize it according to your needs: + +- Copy and paste the `generate_schema_name` macro into a file named 'generate_schema_name'. + +- Modify the target schema by either using [target variables](/reference/dbt-jinja-functions/target) or [env_var](/reference/dbt-jinja-functions/env_var). Check out our [Advanced Deployment - Custom Environment and job behavior](https://courses.getdbt.com/courses/advanced-deployment) course video for more details. + +**Note**: dbt will ignore any custom `generate_schema_name` macros included in installed packages. + +
+❗️ Warning: Don't replace default_schema in the macro. + +If you're modifying how dbt generates schema names, don't just replace ```{{ default_schema }}_{{ custom_schema_name | trim }}``` with ```{{ custom_schema_name | trim }}``` in the ```generate_schema_name``` macro. -Note: dbt ignores any custom `generate_schema_name` macros that are part of a package installed in your project. +If you remove ```{{ default_schema }}```, it causes developers to override each other's models if they create their own custom schemas. This can also cause issues during development and continuous integration (CI). + +❌ The following code block is an example of what your code _should not_ look like: +```sql +{% macro generate_schema_name(custom_schema_name, node) -%} + + {%- set default_schema = target.schema -%} + {%- if custom_schema_name is none -%} + + {{ default_schema }} + + {%- else -%} + # The following is incorrect as it omits {{ default_schema }} before {{ custom_schema_name | trim }}. + {{ custom_schema_name | trim }} + + {%- endif -%} + +{%- endmacro %} + +``` +
### An alternative pattern for generating schema names -A frequently used pattern for generating schema names is to change the behavior based on dbt's environment, such that: -- In prod: - - If a custom schema is provided, a model's schema name should match the custom schema, rather than being concatenated to the target schema. - - If no custom schema is provided, a model's schema name should match the target schema. +A common way to generate schema names is by adjusting the behavior according to the environment in dbt. Here's how it works: -- In other environments (e.g. `dev` or `qa`): - - Build _all_ models in the target schema, as in, ignore custom schema configurations. +**Production environment** -dbt ships with a global macro that contains this logic – `generate_schema_name_for_env`. +- If a custom schema is specified, the schema name of a model should match the custom schema, instead of concatenating to the target schema. +- If no custom schema is specified, the schema name of a model should match the target schema. -If you want to use this pattern, you'll need a `generate_schema_name` macro in your project that points to this logic. You can do this by creating a file in your `macros` directory (we normally call it `get_custom_schema.sql`), and pasting in the following: +**Other environments** (like development or quality assurance (QA)): + +- Build _all_ models in the target schema, ignoring any custom schema configurations. + +dbt ships with a global, predefined macro that contains this logic - `generate_schema_name_for_env`. + +If you want to use this pattern, you'll need a `generate_schema_name` macro in your project that points to this logic. 
You can do this by creating a file in your `macros` directory (typically named `get_custom_schema.sql`), and copying/pasting the following code: @@ -132,33 +171,34 @@ The following context methods _are_ available in the `generate_schema_name` macr | Jinja context | Type | Available | | ------------- | ---- | --------- | -| [target](target) | Variable | ✅ | -| [env_var](env_var) | Variable | ✅ | -| [var](var) | Variable | Limited, see below | -| [exceptions](exceptions) | Macro | ✅ | -| [log](log) | Macro | ✅ | +| [target](/reference/dbt-jinja-functions/target) | Variable | ✅ | +| [env_var](/reference/dbt-jinja-functions/env_var) | Variable | ✅ | +| [var](/reference/dbt-jinja-functions/var) | Variable | Limited, see below | +| [exceptions](/reference/dbt-jinja-functions/exceptions) | Macro | ✅ | +| [log](/reference/dbt-jinja-functions/log) | Macro | ✅ | | Other macros in your project | Macro | ✅ | | Other macros in your packages | Macro | ✅ | -#### Which vars are available in generate_schema_name? +### Which vars are available in generate_schema_name? + +Globally-scoped variables and variables defined on the command line with +[--vars](/docs/build/project-variables) are accessible in the `generate_schema_name` context. - + -Variable semantics have changed in dbt v0.17.0. See the [migration guide](/guides/migration/versions) -for more information on these changes. +### Managing different behaviors across packages - +See docs on macro `dispatch`: ["Managing different global overrides across packages"](/reference/dbt-jinja-functions/dispatch) -Globally-scoped variables and variables defined on the command line with -[--vars](/docs/build/project-variables) are accessible in the `generate_schema_name` context. + ## Managing environments In the `generate_schema_name` macro examples shown above, the `target.name` context variable is used to change the schema name that dbt generates for models. 
If the `generate_schema_name` macro in your project uses the `target.name` context variable, you must additionally ensure that your different dbt environments are configured appropriately. While you can use any naming scheme you'd like, we typically recommend: - **dev**: Your local development environment; configured in a `profiles.yml` file on your computer. -* **ci**: A [continuous integration](/docs/collaborate/git/connect-github) environment running on Pull Requests in GitHub, GitLab, etc. +* **ci**: A [continuous integration](/docs/cloud/git/connect-github) environment running on Pull Requests in GitHub, GitLab, etc. - **prod**: The production deployment of your dbt project, like in dbt Cloud, Airflow, or [similar](/docs/deploy/deployments). If your schema names are being generated incorrectly, double check your target name in the relevant environment. -For more information, consult the [Managing environments](/docs/collaborate/environments) guide. +For more information, consult the [managing environments in dbt Core](/docs/core/dbt-core-environments) guide. diff --git a/website/docs/docs/build/custom-target-names.md b/website/docs/docs/build/custom-target-names.md index 4e14f36b784..ac7036de572 100644 --- a/website/docs/docs/build/custom-target-names.md +++ b/website/docs/docs/build/custom-target-names.md @@ -2,7 +2,7 @@ title: "Custom target names" id: "custom-target-names" description: "You can define a custom target name for any dbt Cloud job to correspond to settings in your dbt project." - +pagination_next: null --- ## dbt Cloud Scheduler diff --git a/website/docs/docs/build/derived-metrics.md b/website/docs/docs/build/derived-metrics.md new file mode 100644 index 00000000000..fc7961bbe7f --- /dev/null +++ b/website/docs/docs/build/derived-metrics.md @@ -0,0 +1,159 @@ +--- +title: "Derived metrics" +id: derived +description: "Derived metrics is defined as an expression of other metrics.." 
+sidebar_label: Derived +tags: [Metrics, Semantic Layer] +--- + +In MetricFlow, derived metrics are metrics created by defining an expression using other metrics. They enable you to perform calculations with existing metrics. This is helpful for combining metrics and doing math functions on aggregated columns, like creating a profit metric. + + The parameters, description, and type for derived metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `expr` | The derived expression. | Required | +| `metrics` | The list of metrics used in the derived metrics. | Required | +| `alias` | Optional alias for the metric that you can use in the expr. | Optional | +| `filter` | Optional filter to apply to the metric. | Optional | +| `offset_window` | Set the period for the offset window, such as 1 month. This will return the value of the metric one month before the metric time. | Optional | + +The following displays the complete specification for derived metrics, along with an example. + +```yaml +metrics: + - name: the metric name # Required + description: the metric description # Optional + type: derived # Required + label: The value that will be displayed in downstream tools #Required + type_params: # Required + expr: the derived expression # Required + metrics: # The list of metrics used in the derived metrics # Required + - name: the name of the metrics. 
must reference a metric you have already defined # Required + alias: optional alias for the metric that you can use in the expr # Optional + filter: optional filter to apply to the metric # Optional + offset_window: set the period for the offset window, such as 1 month. This will return the value of the metric one month before the metric time. # Optional +``` + +## Derived metrics example + +```yaml +metrics: + - name: order_gross_profit + description: Gross profit from each order. + type: derived + label: Order Gross Profit + type_params: + expr: revenue - cost + metrics: + - name: order_total + alias: revenue + - name: order_cost + alias: cost + - name: food_order_gross_profit + label: Food Order Gross Profit + description: "The gross profit for each food order." + type: derived + type_params: + expr: revenue - cost + metrics: + - name: order_total + alias: revenue + filter: | + {{ Dimension('order__is_food_order') }} = True + - name: order_cost + alias: cost + filter: | + {{ Dimension('order__is_food_order') }} = True + - name: order_total_growth_mom + description: "Percentage growth of orders total compared to 1 month ago" + type: derived + label: Order Total Growth % M/M + type_params: + expr: (order_total - order_total_prev_month)*100/order_total_prev_month + metrics: + - name: order_total + - name: order_total + offset_window: 1 month + alias: order_total_prev_month +``` + +## Derived metric offset + +To perform calculations using a metric's value from a previous time period, you can add an offset parameter to a derived metric. For example, if you want to calculate period-over-period growth or track user retention, you can use this metric offset. + +**Note:** You must include the [`metric_time` dimension](/docs/build/dimensions#time) when querying a derived metric with an offset window. 
+ +The following example displays how you can calculate customer retention using a 1-month offset window: + +```yaml +- name: customer_retention + description: Percentage of customers that are active now and those active 1 month ago + label: customer_retention + type_params: + expr: (active_customers/ active_customers_prev_month) + metrics: + - name: active_customers + alias: current_active_customers + - name: active_customers + offset_window: 1 month + alias: active_customers_prev_month +``` + +### Offset windows and granularity + +You can query any granularity and offset window combination. The following example queries a metric with a 7-day offset and a monthly grain: + +```yaml +- name: d7_booking_change + description: Difference between bookings now and 7 days ago + type: derived + label: d7 Bookings Change + type_params: + expr: bookings - bookings_7_days_ago + metrics: + - name: bookings + alias: current_bookings + - name: bookings + offset_window: 7 days + alias: bookings_7_days_ago +``` + +When you run the query `dbt sl query --metrics d7_booking_change --group-by metric_time__month` for the metric, here's how it's calculated. For dbt Core, you can use the `mf query` prefix. + +1. We retrieve the raw, unaggregated dataset with the specified measures and dimensions at the smallest level of detail, which is currently 'day'. +2. Then, we perform an offset join on the daily dataset, followed by performing a date trunc and aggregation to the requested granularity. + For example, to calculate `d7_booking_change` for July 2017: + - First, we sum up all the booking values for each day in July to calculate the bookings metric. + - The following table displays the range of days that make up this monthly aggregation. + +| | Orders | Metric_time | +| - | ---- | -------- | +| | 330 | 2017-07-31 | +| | 7030 | 2017-07-30 to 2017-07-02 | +| | 78 | 2017-07-01 | +| Total | 7438 | 2017-07-01 | + +3. Next, we calculate July's bookings with a 7-day offset. 
The following table displays the range of days that make up this monthly aggregation. Note that the range is shifted 7 days earlier (offset by 7 days), so it runs from 2017-06-24 to 2017-07-24. + +| | Orders | Metric_time | +| - | ---- | -------- | +| | 329 | 2017-07-24 | +| | 6840 | 2017-07-23 to 2017-06-25 | +| | 83 | 2017-06-24 | +| Total | 7252 | 2017-07-01 | + +4. Lastly, we calculate the derived metric and return the final result set: + +```bash +bookings - bookings_7_days_ago would be compiled as 7438 - 7252 = 186. +``` + +| d7_booking_change | metric_time__month | +| ----------------- | ------------------ | +| 186 | 2017-07-01 | diff --git a/website/docs/docs/build/dimensions.md b/website/docs/docs/build/dimensions.md new file mode 100644 index 00000000000..b8679fe11b0 --- /dev/null +++ b/website/docs/docs/build/dimensions.md @@ -0,0 +1,368 @@ +--- +title: Dimensions +id: dimensions +description: "Dimensions determine the level of aggregation for a metric, and are non-aggregatable expressions." +sidebar_label: "Dimensions" +tags: [Metrics, Semantic Layer] +--- + +Dimensions are a way to group or filter information based on categories or time. They're like special labels that help organize and analyze data. + +In a data platform, dimensions are part of a larger structure called a semantic model. They're created along with other elements like [entities](/docs/build/entities) and [measures](/docs/build/measures), and are used to add more details to your data that can't be easily added up or combined. In SQL, dimensions are typically included in the `group by` clause of your SQL query. + + + + +All dimensions require a `name`, a `type`, and, in some cases, an `expr` parameter. + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | Refers to the name of the group that will be visible to the user in downstream tools. It can also serve as an alias if the column name or SQL query reference is different and provided in the `expr` parameter.

Dimension names should be unique within a semantic model, but they can be non-unique across different models as MetricFlow uses [joins](/docs/build/join-logic) to identify the right dimension. | Required | +| `type` | Specifies the type of group created in the semantic model. There are three types:

- **Categorical**: Group rows in a table by categories like geography, color, and so on.
- **Time**: Point to a date field in the data platform. Must be of type TIMESTAMP or equivalent in the data platform engine.
- **Slowly-changing dimensions**: Analyze metrics over time and slice them by groups that change over time, like sales trends by a customer's country. | Required | +| `type_params` | Specific type params such as if the time is primary or used as a partition | Required | +| `description` | A clear description of the dimension | Optional | +| `expr` | Defines the underlying column or SQL query for a dimension. If no `expr` is specified, MetricFlow will use the column with the same name as the group. You can use column name itself to input a SQL expression. | Optional | + +Refer to the following for the complete specification for dimensions: + +```yaml +dimensions: + - name: name of the group that will be visible to the user in downstream tools + type: Categorical or Time + type_params: specific type params such as if the time is primary or used as a partition + description: same as always + expr: the column name or expression. If not provided the default is the dimension name +``` + +Refer to the following example to see how dimensions are used in a semantic model: + +```yaml +semantic_models: + - name: transactions + description: A record for every transaction that takes place. Carts are considered multiple transactions for each SKU. + model: {{ ref("fact_transactions") }} + defaults: + agg_time_dimension: metric_time +# --- entities --- + entities: + ... +# --- measures --- + measures: + ... +# --- dimensions --- + dimensions: + - name: metric_time + type: time + expr: date_trunc('day', ts) + - name: is_bulk_transaction + type: categorical + expr: case when quantity > 10 then true else false end +``` + +MetricFlow requires that all dimensions have a primary entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity: entity_name` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. 
+ +```yaml +semantic_model: + name: bookings_monthly_source + description: bookings_monthly_source + defaults: + agg_time_dimension: ds + model: ref('bookings_monthly_source') + measures: + - name: bookings_monthly + agg: sum + create_metric: true + primary_entity: booking_id +``` + +## Dimensions types + +Dimensions have three types. This section further explains the definitions and provides examples. + +- [Dimensions types](#dimensions-types) + - [Categorical](#categorical) + - [Time](#time) + - [SCD Type II](#scd-type-ii) + +### Categorical + +Categorical dimensions are used to group metrics by different categories such as product type, color, or geographical area. They can refer to existing columns in your dbt model or be calculated using a SQL expression with the `expr` parameter. An example of a category dimension is `is_bulk_transaction`, which is a group created by applying a case statement to the underlying column `quantity`. This allows users to group or filter the data based on bulk transactions. + +```yaml +dimensions: + - name: is_bulk_transaction + type: categorical + expr: case when quantity > 10 then true else false end +``` + +### Time + + +:::tip use datetime data type if using BigQuery +To use BigQuery as your data platform, time dimension columns need to be in the datetime data type. If they are stored in another type, you can cast them to datetime using the `expr` property. Time dimensions are used to group metrics by different levels of time, such as day, week, month, quarter, and year. MetricFlow supports these granularities, which can be specified using the `time_granularity` parameter. +::: + +Time has additional parameters specified under the `type_params` section. When you query one or more metrics in MetricFlow using the CLI, the default time dimension for a single metric is the aggregation time dimension, which you can refer to as `metric_time` or use the dimension's name. + +You can use multiple time groups in separate metrics. 
For example, the `users_created` metric uses `created_at`, and the `users_deleted` metric uses `deleted_at`: + + +```bash +# dbt Cloud users +dbt sl query --metrics users_created,users_deleted --dimensions metric_time --order metric_time + +# dbt Core users +mf query --metrics users_created,users_deleted --dimensions metric_time --order metric_time +``` + + +You can set `is_partition` for time or categorical dimensions to define specific time spans. Additionally, use the `type_params` section to set `time_granularity` to adjust aggregation detail (like daily, weekly, and so on): + + + + + +Use `is_partition: True` to show that a dimension exists over a specific time window. For example, a date-partitioned dimensional table. When you query metrics from different tables, the dbt Semantic Layer uses this parameter to ensure that the correct dimensional values are joined to measures. + +You can also use `is_partition` for [categorical](#categorical) dimensions. + +MetricFlow enables metric aggregation during query time. For example, you can aggregate the `messages_per_month` measure. 
If you originally had a `time_granularity` for the time dimensions `metric_time`, you can specify a yearly granularity for aggregation in your query: + +```bash +# dbt Cloud users +dbt sl query --metrics messages_per_month --dimensions metric_time --order metric_time --time-granularity year + +# dbt Core users +mf query --metrics messages_per_month --dimensions metric_time --order metric_time --time-granularity year +``` + +```yaml +dimensions: + - name: created_at + type: time + expr: date_trunc('day', ts_created) #ts_created is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + - name: deleted_at + type: time + expr: date_trunc('day', ts_deleted) #ts_deleted is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + +measures: + - name: users_deleted + expr: 1 + agg: sum + agg_time_dimension: deleted_at + - name: users_created + expr: 1 + agg: sum +``` + + + + + +`time_granularity` specifies the smallest level of detail that a measure or metric should be reported at, such as daily, weekly, monthly, quarterly, or yearly. Different granularity options are available, and each metric must have a specified granularity. For example, a metric that is specified with weekly granularity couldn't be aggregated to a daily grain. + +The current options for time granularity are day, week, month, quarter, and year. + +Aggregation between metrics with different granularities is possible, with the Semantic Layer returning results at the highest granularity by default. For example, when querying two metrics with daily and monthly granularity, the resulting aggregation will be at the monthly level. 
+ +```yaml +dimensions: + - name: created_at + type: time + expr: date_trunc('day', ts_created) #ts_created is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + - name: deleted_at + type: time + expr: date_trunc('day', ts_deleted) #ts_deleted is the underlying column name from the table + is_partition: True + type_params: + time_granularity: day + +measures: + - name: users_deleted + expr: 1 + agg: sum + agg_time_dimension: deleted_at + - name: users_created + expr: 1 + agg: sum +``` + + + + + +### SCD Type II + +:::caution +Currently, there are limitations in supporting SCDs. +::: + +MetricFlow supports joins against dimension values in a semantic model built on top of a slowly changing dimension (SCD) Type II table. This is useful when you need a particular metric sliced by a group that changes over time, such as the historical trends of sales by a customer's country. + +As their name suggests, SCD Type II dimensions are groups that change values at a coarser time granularity. This results in a range of valid rows with different dimension values for a given metric or measure. MetricFlow associates the metric with the first (minimum) available dimension value within a coarser time window, such as month. By default, MetricFlow uses the group that is valid at the beginning of the time granularity. + +The following basic structure of an SCD Type II data platform table is supported: + +| entity_key | dimensions_1 | dimensions_2 | ... 
| dimensions_x | valid_from | valid_to | +|------------|-------------|-------------|-----|-------------|------------|----------| + +* `entity_key` (required): An entity_key (or some sort of identifier) must be present +* `valid_from` (required): A timestamp indicating the start of a changing dimensions value must be present +* `valid_to` (required): A timestamp indicating the end of a changing dimensions value must be present + +**Note**: The SCD dimensions table must have `valid_to` and `valid_from` columns. + +This is an example of SQL code that shows how a sample metric called `num_events` is joined with versioned dimensions data (stored in a table called `scd_dimensions`) using a primary key made up of the `entity_key` and `timestamp` columns. + + +```sql +select metric_time, dimensions_1, sum(1) as num_events +from events a +left outer join scd_dimensions b +on + a.entity_key = b.entity_key + and a.metric_time >= b.valid_from + and (a.metric_time < b. valid_to or b.valid_to is null) +group by 1, 2 +``` + + + + + +This example shows how to create slowly changing dimensions (SCD) using a semantic model. The SCD table contains information about sales persons' tier and the time length of that tier. Suppose you have the underlying SCD table: + +| sales_person_id | tier | start_date | end_date | +|-----------------|------|------------|----------| +| 111 | 1 | 2019-02-03 | 2020-01-05| +| 111 | 2 | 2020-01-05 | 2048-01-01| +| 222 | 2 | 2020-03-05 | 2048-01-01| +| 333 | 2 | 2020-08-19 | 2021-10-22| +| 333 | 3 | 2021-10-22 | 2048-01-01| + +Take note of the extra arguments under `validity_params`: `is_start` and `is_end`. These arguments indicate the columns in the SCD table that contain the start and end dates for each tier (or beginning or ending timestamp column for a dimensional value). 
+ +```yaml +semantic_models: + - name: sales_person_tiers + description: SCD Type II table of tiers for salespeople + model: {{ref(sales_person_tiers)}} + defaults: + agg_time_dimension: tier_start + + dimensions: + - name: tier_start + type: time + expr: start_date + type_params: + time_granularity: day + validity_params: + is_start: True + - name: tier_end + type: time + expr: end_date + type_params: + time_granularity: day + validity_params: + is_end: True + - name: tier + type: categorical + + entities: + - name: sales_person + type: primary + expr: sales_person_id +``` + +The following code represents a separate semantic model that holds a fact table for `transactions`: + +```yaml +semantic_models: + - name: transactions + description: | + Each row represents one transaction. + There is a transaction, product, sales_person, and customer id for + every transaction. There is only one transaction id per + transaction. The `metric_time` or date is reflected in UTC. + model: {{ ref(fact_transactions) }} + defaults: + agg_time_dimension: metric_time + + entities: + - name: transaction_id + type: primary + - name: customer + type: foreign + expr: customer_id + - name: product + type: foreign + expr: product_id + - name: sales_person + type: foreign + expr: sales_person_id + + measures: + - name: transactions + expr: 1 + agg: sum + - name: gross_sales + expr: sales_price + agg: sum + - name: sales_persons_with_a_sale + expr: sales_person_id + agg: count_distinct + + dimensions: + - name: metric_time + type: time + is_partition: true + type_params: + time_granularity: day + - name: sales_geo + type: categorical +``` + +You can now access the metrics in the `transactions` semantic model organized by the slowly changing dimension of `tier`. 
+ +In the sales tier example, if a salesperson was Tier 1 from 2022-03-01 to 2022-03-12, and was promoted to Tier 2 from 2022-03-12 onwards, all transactions from March would be categorized under Tier 1 since the dimensions value of Tier 1 comes earlier (and is the default starting point), even though the salesperson was promoted to Tier 2 on 2022-03-12. + + + + + +This example shows how to create slowly changing dimensions (SCD) using a semantic model. The SCD table contains information about salespersons' tier and the time length of that tier. Suppose you have the underlying SCD table: + +| sales_person_id | tier | start_date | end_date | +|-----------------|------|------------|----------| +| 111 | 1 | 2019-02-03 | 2020-01-05| +| 111 | 2 | 2020-01-05 | 2048-01-01| +| 222 | 2 | 2020-03-05 | 2048-01-01| +| 333 | 2 | 2020-08-19 | 2021-10-22| +| 333 | 3 | 2021-10-22 | 2048-01-01| + +In the sales tier example, if sales_person_id 456 is Tier 2 from 2022-03-08 onwards, but there is no associated tier level dimension for this person from 2022-03-01 to 2022-03-08, then all transactions associated with sales_person_id 456 for the month of March will be grouped under 'NA' since no tier is present prior to Tier 2. 
+ +The following command or code represents how to return the count of transactions generated by each sales tier per month: + +```bash +# dbt Cloud users +dbt sl query --metrics transactions --dimensions metric_time__month,sales_person__tier --order metric_time__month --order sales_person__tier + +# dbt Core users +mf query --metrics transactions --dimensions metric_time__month,sales_person__tier --order metric_time__month --order sales_person__tier + +``` + + + diff --git a/website/docs/docs/build/enhance-your-code.md b/website/docs/docs/build/enhance-your-code.md new file mode 100644 index 00000000000..5f2d48f6f5a --- /dev/null +++ b/website/docs/docs/build/enhance-your-code.md @@ -0,0 +1,38 @@ +--- +title: "Enhance your code" +description: "Learn how you can enhance your code" +pagination_next: "docs/build/project-variables" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + + +
\ No newline at end of file diff --git a/website/docs/docs/build/enhance-your-models.md b/website/docs/docs/build/enhance-your-models.md new file mode 100644 index 00000000000..46e7fa74353 --- /dev/null +++ b/website/docs/docs/build/enhance-your-models.md @@ -0,0 +1,23 @@ +--- +title: "Enhance your models" +description: "Learn how you can enhance your models" +pagination_next: "docs/build/materializations" +pagination_prev: null +--- + +
+ + + + + +
+
\ No newline at end of file diff --git a/website/docs/docs/build/entities.md b/website/docs/docs/build/entities.md new file mode 100644 index 00000000000..464fa2c3b8c --- /dev/null +++ b/website/docs/docs/build/entities.md @@ -0,0 +1,56 @@ +--- +title: Entities +id: entities +description: "Entities are real-world concepts that correspond to key parts of your business, such as customers, transactions, and ad campaigns." +sidebar_label: "Entities" +tags: [Metrics, Semantic Layer] +--- + +Entities are real-world concepts in a business such as customers, transactions, and ad campaigns. We often focus our analyses around specific entities, such as customer churn or annual recurring revenue modeling. We represent entities in our semantic models using id columns that serve as join keys to other semantic models in your semantic graph. + +Within a semantic graph, the required parameters for an entity are `name` and `type`. The `name` refers to either the key column name from the underlying data table, or it may serve as an alias with the column name referenced in the `expr` parameter. + +Entities can be specified with a single column or multiple columns. Entities (join keys) in a semantic model are identified by their name. Each entity name must be unique within a semantic model, but it doesn't have to be unique across different semantic models. + +There are four entity types: primary, foreign, unique, and natural. + +:::tip Use entities as dimensions +You can also use entities as dimensions, which allows you to aggregate a metric to the granularity of that entity. +::: + + +## Entity types + +MetricFlow's join logic depends on the entity `type` you use, and it also determines how to join semantic models. Refer to [Joins](/docs/build/join-logic) for more info on how to construct joins. + +* **Primary —** A primary key has **only one** record for each row in the table, and it includes every record in the data platform. 
+* **Unique —** A unique key contains **only one** record per row in the table, but it may have a subset of records in the data warehouse. It can also include nulls. +* **Foreign —** A foreign key can include zero, one, or multiple instances of the same record. Null values may also be present. +* **Natural —** Natural keys are a column or a combination of columns in a table that uniquely identify a record based on real-world data. For instance, in a sales_person_department dimension table, the sales_person_id can serve as a natural key. + +The complete spec for entities is below: +```yaml +entities: + - name: transaction ## Required + type: primary or natural or foreign or unique ## Required + description: a description of the field or role the entity takes in this table ## Optional + expr: the field that denotes that entity (transaction_id). ## Optional + If not specified will default to name + +``` + +Here's an example of how to define entities in a semantic model: + +``` yaml +entities: + - name: transaction + type: primary + expr: id_transaction + - name: order + type: foreign + expr: id_order + - name: user + type: foreign + expr: substring(id_order from 2) +``` + diff --git a/website/docs/docs/build/environment-variables.md b/website/docs/docs/build/environment-variables.md index c6a2517c9a2..55d3fd19c6c 100644 --- a/website/docs/docs/build/environment-variables.md +++ b/website/docs/docs/build/environment-variables.md @@ -44,7 +44,7 @@ To the right of the `Project Default` column are all your environments. Values s **Overriding environment variables at the job level** -You may have multiple jobs that run in the same environment, and you'd like the environment variable to be interpretted differently depending on the job. +You may have multiple jobs that run in the same environment, and you'd like the environment variable to be interpreted differently depending on the job. 
When setting up or editing a job, you will see a section where you can override environment variable values defined at the environment or project level. @@ -121,13 +121,26 @@ Environment variables can be used in many ways, and they give you the power and Now that you can set secrets as environment variables, you can pass git tokens into your package HTTPS URLs to allow for on-the-fly cloning of private repositories. Read more about enabling [private package cloning](/docs/build/packages#private-packages). #### Dynamically set your warehouse in your Snowflake connection -Environment variables make it possible to dynamically change the Snowflake virtual warehouse size depending on the job. Instead of calling the warehouse name directly in your project connection, you can reference an environment variable which will get set to a specific virtual warehouse at runtime. +Environment variables make it possible to dynamically change the Snowflake virtual warehouse size depending on the job. Instead of calling the warehouse name directly in your project connection, you can reference an environment variable which will get set to a specific virtual warehouse at runtime. For example, suppose you'd like to run a full-refresh job in an XL warehouse, but your incremental job only needs to run in a medium-sized warehouse. Both jobs are configured in the same dbt Cloud environment. In your connection configuration, you can use an environment variable to set the warehouse name to `{{env_var('DBT_WAREHOUSE')}}`. Then in the job settings, you can set a different value for the `DBT_WAREHOUSE` environment variable depending on the job's workload. +Currently, it's not possible to dynamically set environment variables across models within a single run. This is because each env_var can only have a single set value for the entire duration of the run. + +**Note** — You can also use this method with Databricks SQL Warehouse. 
+:::info Environment variables and Snowflake OAuth limitations +Env vars work fine with username/password and keypair, including scheduled jobs, because dbt Core consumes the Jinja inserted into the autogenerated `profiles.yml` and resolves it to do an `env_var` lookup. + +However, there are some limitations when using env vars with Snowflake OAuth Connection settings: + +- You can't use them in the account/host field, but they can be used for database, warehouse, and role. + +Note that if you supply an environment variable in the account/host field, Snowflake OAuth Connection will **fail** to connect. This happens because the field doesn't pass through Jinja rendering, so dbt Cloud simply passes the literal `env_var` code into a URL string like `{{ env_var("DBT_ACCOUNT_HOST_NAME") }}.snowflakecomputing.com`, which is an invalid hostname. +::: + #### Audit your run metadata Here's another motivating example that uses the dbt Cloud run ID, which is set automatically at each run. This additional data field can be used for auditing and debugging: diff --git a/website/docs/docs/build/exposures.md b/website/docs/docs/build/exposures.md index 2c06c5f4506..65c0792e0a0 100644 --- a/website/docs/docs/build/exposures.md +++ b/website/docs/docs/build/exposures.md @@ -1,29 +1,19 @@ --- -title: "Exposures" +title: "Add Exposures to your DAG" +sidebar_label: "Exposures" id: "exposures" --- - - -* **v0.18.1**: Exposures are new! -* **v0.20.0**: Exposures support `tags` and `meta` properties - - - -## Related documentation -* [Exposure properties](exposure-properties) -* [`exposure:` selection method](node-selection/methods#the-exposure-method) - -## Overview - Exposures make it possible to define and describe a downstream use of your dbt project, such as in a dashboard, application, or data science pipeline. 
By defining exposures, you can then: - run, test, and list resources that feed into your exposure -- populate a dedicated page in the auto-generated [documentation](documentation) site with context relevant to data consumers +- populate a dedicated page in the auto-generated [documentation](/docs/collaborate/documentation) site with context relevant to data consumers ### Declaring an exposure Exposures are defined in `.yml` files nested under an `exposures:` key. + + ```yaml @@ -32,6 +22,7 @@ version: 2 exposures: - name: weekly_jaffle_metrics + label: Jaffles by the Week type: dashboard maturity: high url: https://bi.tool/dashboards/1 @@ -42,28 +33,72 @@ exposures: - ref('fct_orders') - ref('dim_customers') - source('gsheets', 'goals') + - metric('count_orders') owner: - name: Claire from Data + name: Callum McData email: data@jaffleshop.com ``` + + + + + + +```yaml +version: 2 + +exposures: + + - name: weekly_jaffle_report + type: dashboard + maturity: high + url: https://bi.tool/dashboards/1 + description: > + Did someone say "exponential growth"? 
+ + depends_on: + - ref('fct_orders') + - ref('dim_customers') + - source('gsheets', 'goals') + + owner: + name: Callum McData + email: data@jaffleshop.com +``` + + + + + ### Available properties _Required:_ -- **name** (must be unique among exposures and you must use the [snake case](https://en.wikipedia.org/wiki/Snake_case) naming convention) +- **name**: a unique exposure name written in [snake case](https://en.wikipedia.org/wiki/Snake_case) - **type**: one of `dashboard`, `notebook`, `analysis`, `ml`, `application` (used to organize in docs site) -- **owner**: email +- **owner**: `name` or `email` required; additional properties allowed + + + +_Expected:_ +- **depends_on**: list of refable nodes, including `ref`, `source`, and `metric` (While possible, it is highly unlikely you will ever need an `exposure` to depend on a `source` directly) + + + + _Expected:_ -- **depends_on**: list of refable nodes (`ref` + `source`) +- **depends_on**: list of refable nodes, including `ref` and `source` (While possible, it is highly unlikely you will ever need an `exposure` to depend on a `source` directly) + + _Optional:_ +- **label**: may contain spaces, capital letters, or special characters. - **url**: enables the link to **View this exposure** in the upper right corner of the generated documentation site - **maturity**: one of `high`, `medium`, `low` -- **owner**: name _General properties (optional)_ - **description** @@ -76,11 +111,18 @@ We plan to add more subtypes and optional properties in future releases. 
Once an exposure is defined, you can run commands that reference it: ``` -dbt run -s +exposure:weekly_jaffle_metrics -dbt test -s +exposure:weekly_jaffle_metrics +dbt run -s +exposure:weekly_jaffle_report +dbt test -s +exposure:weekly_jaffle_report + ``` When we generate our documentation site, you'll see the exposure appear: + +## Related docs + +* [Exposure properties](/reference/exposure-properties) +* [`exposure:` selection method](/reference/node-selection/methods#the-exposure-method) +* [Dashboard status tiles](/docs/deploy/dashboard-status-tiles) diff --git a/website/docs/docs/build/groups.md b/website/docs/docs/build/groups.md new file mode 100644 index 00000000000..d4fda045277 --- /dev/null +++ b/website/docs/docs/build/groups.md @@ -0,0 +1,128 @@ +--- +title: "Add groups to your DAG" +sidebar_label: "Groups" +id: "groups" +description: "When you define groups in dbt projects, you turn implicit relationships into an explicit grouping." +keywords: + - groups access mesh +--- + +:::info New functionality +This functionality is new in v1.5. +::: + +## Related docs + +* [Model Access](/docs/collaborate/govern/model-access#groups) +* [Group configuration](/reference/resource-configs/group) +* [Group selection](/reference/node-selection/methods#the-group-method) + +## About groups + +A group is a collection of nodes within a dbt DAG. Groups are named, and every group has an `owner`. They enable intentional collaboration within and across teams by restricting [access to private](/reference/resource-configs/access) models. + +Group members may include models, tests, seeds, snapshots, analyses, and metrics. (Not included: sources and exposures.) Each node may belong to only one group. + +### Declaring a group + +Groups are defined in `.yml` files, nested under a `groups:` key. 
+ + + +```yaml +groups: + - name: finance + owner: + # 'name' or 'email' is required; additional properties allowed + email: finance@jaffleshop.com + slack: finance-data + github: finance-data-team +``` + + + +### Adding a model to a group + +Use the `group` configuration to add one or more models to a group. + + + + + + +```yml +models: + marts: + finance: + +group: finance +``` + + + + + + + + + +```yml +models: + - name: model_name + config: + group: finance +``` + + + + + + + + + +```sql +{{ config(group = 'finance') }} + +select ... +``` + + + + + + + +### Referencing a model in a group + +By default, all models within a group have the `protected` [access modifier](/reference/resource-configs/access). This means they can be referenced by downstream resources in _any_ group in the same project, using the [`ref`](/reference/dbt-jinja-functions/ref) function. If a grouped model's `access` property is set to `private`, only resources within its group can reference it. + + + +```yml +models: + - name: finance_private_model + access: private + config: + group: finance + + # in a different group! + - name: marketing_model + config: + group: marketing +``` + + + + +```sql +select * from {{ ref('finance_model') }} +``` + + +```shell +$ dbt run -s marketing_model +... +dbt.exceptions.DbtReferenceError: Parsing Error + Node model.jaffle_shop.marketing_model attempted to reference node model.jaffle_shop.finance_model, + which is not allowed because the referenced node is private to the finance group. +``` diff --git a/website/docs/docs/build/hooks-operations.md b/website/docs/docs/build/hooks-operations.md index 524a75aaf9c..85378498a36 100644 --- a/website/docs/docs/build/hooks-operations.md +++ b/website/docs/docs/build/hooks-operations.md @@ -1,17 +1,20 @@ --- title: "Hooks and operations" +description: "Read this tutorial to learn how to use hooks and operations when building in dbt." 
id: "hooks-operations" --- +import OnRunCommands from '/snippets/_onrunstart-onrunend-commands.md'; + ## Related documentation -* [pre-hook & post-hook](pre-hook-post-hook) -* [on-run-start & on-run-end](on-run-start-on-run-end) -* [`run-operation` command](run-operation) +* [pre-hook & post-hook](/reference/resource-configs/pre-hook-post-hook) +* [on-run-start & on-run-end](/reference/project-configs/on-run-start-on-run-end) +* [`run-operation` command](/reference/commands/run-operation) ### Assumed knowledge -* [Project configurations](reference/dbt_project.yml.md) -* [Model configurations](model-configs) -* [Macros](jinja-macros#macros) +* [Project configurations](/reference/dbt_project.yml.md) +* [Model configurations](/reference/model-configs) +* [Macros](/docs/build/jinja-macros#macros) ## Getting started with hooks and operations @@ -32,12 +35,12 @@ dbt provides hooks and operations so you can version control and execute these s Hooks are snippets of SQL that are executed at different times: * `pre-hook`: executed _before_ a model, seed or snapshot is built. * `post-hook`: executed _after_ a model, seed or snapshot is built. - * `on-run-start`: executed at the _start_ of `dbt run`, `dbt seed` or `dbt snapshot` - * `on-run-end`: executed at the _end_ of `dbt run`, `dbt seed` or `dbt snapshot` + * `on-run-start`: executed at the _start_ of + * `on-run-end`: executed at the _end_ of Hooks are a more-advanced capability that enable you to run custom SQL, and leverage database-specific actions, beyond what dbt makes available out-of-the-box with standard materializations and configurations. - + @@ -50,7 +53,7 @@ If (and only if) you can't leverage the [`grants` resource-config](/reference/re You can use hooks to trigger actions at certain times when running an operation or building a model, seed, or snapshot. 
-For more information about when hooks can be triggered, see reference sections for [`on-run-start` and `on-run-end` hooks](on-run-start-on-run-end) and [`pre-hook`s and `post-hook`s](pre-hook-post-hook). +For more information about when hooks can be triggered, see reference sections for [`on-run-start` and `on-run-end` hooks](/reference/project-configs/on-run-start-on-run-end) and [`pre-hook`s and `post-hook`s](/reference/resource-configs/pre-hook-post-hook). You can use hooks to provide database-specific functionality not available out-of-the-box with dbt. For example, you can use a `config` block to run an `ALTER TABLE` statement right after building an individual model using a `post-hook`: @@ -67,139 +70,18 @@ You can use hooks to provide database-specific functionality not available out-o
- - - - -### Examples using hooks - -Here's a minimal example of using hooks to grant privileges. For more information, see [`on-run-start` & `on-run-end` hooks](on-run-start-on-run-end) and [`pre-hook` & `post-hook`](pre-hook-post-hook) reference sections. - - - -```yml -on-run-end: - - "grant usage on {{ target.schema }} to role reporter" - -models: - +post-hook: - - "grant select on {{ this }} to role reporter" - -``` - - - -You can also apply the `post-hook` to individual models using a `config` block: - - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to role reporter" - ] -) }} - -select ... - -``` - - - -You should use database-specific syntax when appropriate: - - - -
- - - -```sql -{{ config( - post_hook=[ - 'grant `roles/bigquery.dataViewer` on {{ this.type }} {{ this }} to "user:someone@yourcompany.com"' - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to `someone@yourcompany.com`" - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to reporter" - ] -) }} - -select ... - -``` - - - -
- -
- - - -```sql -{{ config( - post_hook=[ - "grant select on {{ this }} to role reporter" - ] -) }} - -select ... - -``` - - - -
- -
-
### Calling a macro in a hook -You can also use a [macro](jinja-macros#macros) to bundle up hook logic. Check out some of the examples in the reference sections for [on-run-start and on-run-end hooks](on-run-start-on-run-end) and [pre- and post-hooks](pre-hook-post-hook). +You can also use a [macro](/docs/build/jinja-macros#macros) to bundle up hook logic. Check out some of the examples in the reference sections for [on-run-start and on-run-end hooks](/reference/project-configs/on-run-start-on-run-end) and [pre- and post-hooks](/reference/resource-configs/pre-hook-post-hook). ## About operations -Operations are [macros](jinja-macros#macros) that you can run using the [`run-operation` command](run-operation) command. As such, operations aren't actually a separate resource in your dbt project — they are just a convenient way to invoke a macro without needing to run a model. +Operations are [macros](/docs/build/jinja-macros#macros) that you can run using the [`run-operation`](/reference/commands/run-operation) command. As such, operations aren't actually a separate resource in your dbt project — they are just a convenient way to invoke a macro without needing to run a model. :::info Explicitly execute the SQL in an operation -Unlike hooks, you need to explicitly execute a query within a macro, by using either a [statement block](statement-blocks) or a helper macro like the [run_query macro](run_query) macro. Otherwise, dbt will return the query as a string without executing it. +Unlike hooks, you need to explicitly execute a query within a macro, by using either a [statement block](/reference/dbt-jinja-functions/statement-blocks) or a helper macro like the [run_query](/reference/dbt-jinja-functions/run_query) macro. Otherwise, dbt will return the query as a string without executing it. 
::: This macro performs a similar action as the above hooks: @@ -231,7 +113,7 @@ Privileges granted ``` -Full usage docs can for the `run-operation` command can be found [here](run-operation). +Full usage docs for the `run-operation` command can be found [here](/reference/commands/run-operation). ## Additional examples diff --git a/website/docs/docs/build/incremental-models.md b/website/docs/docs/build/incremental-models.md index 08911764e32..07a571cd4db 100644 --- a/website/docs/docs/build/incremental-models.md +++ b/website/docs/docs/build/incremental-models.md @@ -1,10 +1,9 @@ --- title: "Incremental models" +description: "Read this tutorial to learn how to use incremental models when building in dbt." id: "incremental-models" --- -## Overview - Incremental models are built as tables in your . The first time a model is run, the is built by transforming _all_ rows of source data. On subsequent runs, dbt transforms _only_ the rows in your source data that you tell dbt to filter for, inserting them into the target table which is the table that has already been built. Often, the rows you filter for on an incremental run will be the rows in your source data that have been created or updated since the last time dbt ran. As such, on each dbt run, your model gets built incrementally. @@ -27,14 +26,16 @@ select ... To use incremental models, you also need to tell dbt: -* How to filter the rows on an incremental run. -* The uniqueness constraint of the model (if any). +* How to filter the rows on an incremental run +* The unique key of the model (if any) ### Filtering rows on an incremental run To tell dbt which rows it should transform on an incremental run, wrap valid SQL that filters for these rows in the `is_incremental()` macro. -Often, you'll want to filter for "new" rows, as in, rows that have been created since the last time dbt ran this model. 
The best way to find the timestamp of the most recent run of this model is by checking the most recent timestamp in your target table. dbt makes it easy to query your target table by using the "[{{ this }}](this)" variable. +Often, you'll want to filter for "new" rows, as in, rows that have been created since the last time dbt ran this model. The best way to find the timestamp of the most recent run of this model is by checking the most recent timestamp in your target table. dbt makes it easy to query your target table by using the "[{{ this }}](/reference/dbt-jinja-functions/this)" variable. + +Also common is wanting to capture both new and updated records. For updated records, you'll need to [define a unique key](#defining-a-unique-key-optional) to ensure you don't bring in modified records as duplicates. Your `is_incremental()` code will check for rows created *or modified* since the last time dbt ran this model. For example, a model that includes a computationally slow transformation on a column can be built incrementally, as follows: @@ -56,6 +57,7 @@ from raw_app_data.events {% if is_incremental() %} -- this filter will only be applied on an incremental run + -- (uses > to include records whose timestamp occurred since the last run of this model) where event_time > (select max(event_time) from {{ this }}) {% endif %} @@ -71,35 +73,33 @@ For more complex incremental models that make use of Common Table Expressions (C ::: -### Defining a uniqueness constraint (optional) - -A `unique_key` determines whether a record has new values and should be updated. By using `unique_key`, you can ensure that each row from the source table is represented by a single row in your incremental model, without duplicates. Not specifying a `unique_key` will result in append-only behavior, which means dbt inserts all rows returned by the model's SQL into the preexisting target table without regard for whether the rows represent duplicates. 
+### Defining a unique key (optional) - - -This optional parameter for incremental models specifies a field that can uniquely identify each row within your model. You can define `unique_key` in a configuration block at the top of your model. If your model doesn't contain a single field that is unique, but rather a combination of columns, we recommend that you create a single column that can serve as unique identifier (by concatenating and hashing those columns), and pass it into your model's configuration. - - +A `unique_key` enables updating existing rows instead of just appending new rows. If new information arrives for an existing `unique_key`, that new information can replace the current information instead of being appended to the table. If a duplicate row arrives, it can be ignored. Refer to [strategy specific configs](#strategy-specific-configs) for more options on managing this update behavior, like choosing only specific columns to update. - +Not specifying a `unique_key` will result in append-only behavior, which means dbt inserts all rows returned by the model's SQL into the preexisting target table without regard for whether the rows represent duplicates. -This optional parameter for incremental models specifies a field (or combination of fields) that can uniquely identify each row within your model. You can define `unique_key` in a configuration block at the top of your model, and it can be a list in addition to a single column name. +The optional `unique_key` parameter specifies a field (or combination of fields) that define the grain of your model. That is, the field(s) identify a single unique row. You can define `unique_key` in a configuration block at the top of your model, and it can be a single column name or a list of column names. -The `unique_key` should be supplied in your model definition as a string representing a simple column or a list of single quoted column names that can be used together, for example, `['col1', 'col2', …])`. 
+The `unique_key` should be supplied in your model definition as a string representing a single column or a list of single-quoted column names that can be used together, for example, `['col1', 'col2', …])`. Columns used in this way should not contain any nulls, or the incremental model run may fail. Either ensure that each column has no nulls (for example with `coalesce(COLUMN_NAME, 'VALUE_IF_NULL')`), or define a single-column [surrogate key](/terms/surrogate-key) (for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source)). :::tip In cases where you need multiple columns in combination to uniquely identify each row, we recommend you pass these columns as a list (`unique_key = ['user_id', 'session_number']`), rather than a string expression (`unique_key = 'concat(user_id, session_number)'`). By using the first syntax, which is more universal, dbt can ensure that the columns will be templated into your incremental model materialization in a way that's appropriate to your database. + +When you pass a list in this way, please ensure that each column does not contain any nulls, or the incremental model run may fail. + +Alternatively, you can define a single-column [surrogate key](/terms/surrogate-key), for example with [`dbt_utils.generate_surrogate_key`](https://github.com/dbt-labs/dbt-utils#generate_surrogate_key-source). ::: - - When you define a `unique_key`, you'll see this behavior for each row of "new" data returned by your dbt model: -* If the same `unique_key` is present in the "new" and "old" model data, dbt will update/replace the old row with the new row of data. The exact mechanics of how that update/replace takes place will vary depending on your database and [incremental strategy](#about-incremental_strategy). +* If the same `unique_key` is present in the "new" and "old" model data, dbt will update/replace the old row with the new row of data. 
The exact mechanics of how that update/replace takes place will vary depending on your database, [incremental strategy](#about-incremental_strategy), and [strategy specific configs](#strategy-specific-configs). * If the `unique_key` is _not_ present in the "old" data, dbt will insert the entire row into the table. +Please note that if there's a unique_key with more than one row in either the existing target table or the new incremental rows, the incremental model may fail depending on your database and [incremental strategy](#about-incremental_strategy). If you're having issues running an incremental model, it's a good idea to double check that the unique key is truly unique in both your existing database table and your new incremental rows. You can [learn more about surrogate keys here](/terms/surrogate-key). + :::info While common incremental strategies, such as`delete+insert` + `merge`, might use `unique_key`, others don't. For example, the `insert_overwrite` strategy does not use `unique_key`, because it operates on partitions of data rather than individual rows. For more information, see [About incremental_strategy](#about-incremental_strategy). ::: @@ -128,6 +128,7 @@ from raw_app_data.events {% if is_incremental() %} -- this filter will only be applied on an incremental run + -- (uses >= to include records arriving later on the same day as the last run of this model) where date_day >= (select max(date_day) from {{ this }}) {% endif %} @@ -149,7 +150,7 @@ $ dbt run --full-refresh --select my_incremental_model+ ``` It's also advisable to rebuild any downstream models, as indicated by the trailing `+`. -For detailed usage instructions, check out the [dbt run](run) documentation. +For detailed usage instructions, check out the [dbt run](/reference/commands/run) documentation. # Understanding incremental models ## When should I use an incremental model? 
@@ -160,7 +161,7 @@ It's often desirable to build models as tables in your data warehouse since down Like many things in programming, incremental models are a trade-off between complexity and performance. While they are not as straightforward as the `view` and `table` materializations, they can lead to significantly better performance of your dbt runs. ## Understanding the is_incremental() macro -The `is_incremental()` macro will return `True` if: +The `is_incremental()` macro will return `True` if _all_ of the following conditions are met: * the destination table already exists in the database * dbt is _not_ running in full-refresh mode * the running model is configured with `materialized='incremental'` @@ -177,11 +178,7 @@ Transaction management is used to ensure this is executed as a single unit of wo ## What if the columns of my incremental model change? -:::tip New `on_schema_change` config in dbt version `v0.21.0` - -Incremental models can now be configured to include an optional `on_schema_change` parameter to enable additional control when incremental model columns change. These options enable dbt to continue running incremental models in the presence of schema changes, resulting in fewer `--full-refresh` scenarios and saving query costs. - -::: +Incremental models can be configured to include an optional `on_schema_change` parameter to enable additional control when incremental model columns change. These options enable dbt to continue running incremental models in the presence of schema changes, resulting in fewer `--full-refresh` scenarios and saving query costs. You can configure the `on_schema_change` setting as follows. @@ -217,6 +214,12 @@ The possible values for `on_schema_change` are: **Note**: None of the `on_schema_change` behaviors backfill values in old records for newly added columns. If you need to populate those values, we recommend running manual updates, or triggering a `--full-refresh`. 
+:::caution `on_schema_change` tracks top-level changes + +Currently, `on_schema_change` only tracks top-level column changes. It does not track nested column changes. For example, on BigQuery, adding, removing, or modifying a nested column will not trigger a schema change, even if `on_schema_change` is set appropriately. + +::: + ### Default behavior This is the behavior if `on_schema_change: ignore`, which is set by default, and on older versions of dbt. @@ -227,15 +230,62 @@ Similarly, if you remove a column from your incremental model, and execute a `db Instead, whenever the logic of your incremental changes, execute a full-refresh run of both your incremental model and any downstream models. -## About incremental_strategy +## About `incremental_strategy` + +There are various ways (strategies) to implement the concept of an incremental materialization. The value of each strategy depends on: + +* the volume of data, +* the reliability of your `unique_key`, and +* the support of certain features in your data platform -On some adapters, an optional `incremental_strategy` config controls the code that dbt uses -to build incremental models. Different approaches may vary by effectiveness depending on the volume of data, -the reliability of your `unique_key`, or the availability of certain features. +An optional `incremental_strategy` config is provided in some adapters that controls the code that dbt uses +to build incremental models. -* [Snowflake](snowflake-configs#merge-behavior-incremental-models): `merge` (default), `delete+insert` (optional) -* [BigQuery](bigquery-configs#merge-behavior-incremental-models): `merge` (default), `insert_overwrite` (optional) -* [Spark](spark-configs#incremental-models): `append` (default), `insert_overwrite` (optional), `merge` (optional, Delta-only) +### Supported incremental strategies by adapter + +Click the name of the adapter in the table below for more information about supported incremental strategies.
+ +The `merge` strategy is available in dbt-postgres and dbt-redshift beginning in dbt v1.6. + + + + +| data platform adapter | default strategy | additional supported strategies | +| :-------------------| ---------------- | -------------------- | +| [dbt-postgres](/reference/resource-configs/postgres-configs#incremental-materialization-strategies) | `append` | `delete+insert` | +| [dbt-redshift](/reference/resource-configs/redshift-configs#incremental-materialization-strategies) | `append` | `delete+insert` | +| [dbt-bigquery](/reference/resource-configs/bigquery-configs#merge-behavior-incremental-models) | `merge` | `insert_overwrite` | +| [dbt-spark](/reference/resource-configs/spark-configs#incremental-models) | `append` | `merge` (Delta only) `insert_overwrite` | +| [dbt-databricks](/reference/resource-configs/databricks-configs#incremental-models) | `append` | `merge` (Delta only) `insert_overwrite` | +| [dbt-snowflake](/reference/resource-configs/snowflake-configs#merge-behavior-incremental-models) | `merge` | `append`, `delete+insert` | +| [dbt-trino](/reference/resource-configs/trino-configs#incremental) | `append` | `merge` `delete+insert` | + + + + + + +| data platform adapter | default strategy | additional supported strategies | +| :----------------- | :----------------| : ---------------------------------- | +| [dbt-postgres](/reference/resource-configs/postgres-configs#incremental-materialization-strategies) | `append` | `merge` , `delete+insert` | +| [dbt-redshift](/reference/resource-configs/redshift-configs#incremental-materialization-strategies) | `append` | `merge`, `delete+insert` | +| [dbt-bigquery](/reference/resource-configs/bigquery-configs#merge-behavior-incremental-models) | `merge` | `insert_overwrite` | +| [dbt-spark](/reference/resource-configs/spark-configs#incremental-models) | `append` | `merge` (Delta only) `insert_overwrite` | +| [dbt-databricks](/reference/resource-configs/databricks-configs#incremental-models) | `append` | 
`merge` (Delta only) `insert_overwrite` | +| [dbt-snowflake](/reference/resource-configs/snowflake-configs#merge-behavior-incremental-models) | `merge` | `append`, `delete+insert` | +| [dbt-trino](/reference/resource-configs/trino-configs#incremental) | `append` | `merge` `delete+insert` | + + + + + +:::note Snowflake Configurations + +dbt v1.3 changed the default materialization for incremental table merges from `temporary table` to `view`. For more information about this change and instructions for setting the configuration to a temp table, please read about [Snowflake temporary tables](/reference/resource-configs/snowflake-configs#temporary-tables). + +::: + + ### Configuring incremental strategy @@ -270,14 +320,9 @@ select ...
-### Strategy-specific configs + - - - - **v0.20.0:** Introduced `merge_update_columns` - - **v0.21.0:** Introduced `on_schema_change` - - +### Strategy-specific configs If you are using the `merge` strategy and have specified a `unique_key`, by default, dbt will entirely overwrite matched rows with new values. @@ -300,5 +345,108 @@ select ... - +Alternatively, you can specify a list of columns to exclude from being updated by passing a list of column names to a `merge_exclude_columns` config. + + + +```sql +{{ + config( + materialized = 'incremental', + unique_key = 'id', + merge_exclude_columns = ['created_at'], + ... + ) +}} + +select ... +``` + + + + + + + +### About incremental_predicates + +`incremental_predicates` is an advanced use of incremental models, where data volume is large enough to justify additional investments in performance. This config accepts a list of any valid SQL expression(s). dbt does not check the syntax of the SQL statements. + +This is an example of a model configuration in a `yml` file we might expect to see on Snowflake: + +```yml + +models: + - name: my_incremental_model + config: + materialized: incremental + unique_key: id + # this will affect how the data is stored on disk, and indexed to limit scans + cluster_by: ['session_start'] + incremental_strategy: merge + # this limits the scan of the existing table to the last 7 days of data + incremental_predicates: ["DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date)"] + # `incremental_predicates` accepts a list of SQL statements. + # `DBT_INTERNAL_DEST` and `DBT_INTERNAL_SOURCE` are the standard aliases for the target table and temporary table, respectively, during an incremental run using the merge strategy.
+``` + +Alternatively, here are the same configurations set within a model file: + +```sql +-- in models/my_incremental_model.sql + +{{ + config( + materialized = 'incremental', + unique_key = 'id', + cluster_by = ['session_start'], + incremental_strategy = 'merge', + incremental_predicates = [ + "DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date)" + ] + ) +}} + +... + +``` + +This will template (in the `dbt.log` file) a `merge` statement like: +```sql +merge into DBT_INTERNAL_DEST + from DBT_INTERNAL_SOURCE + on + -- unique key + DBT_INTERNAL_DEST.id = DBT_INTERNAL_SOURCE.id + and + -- custom predicate: limits data scan in the "old" data / existing table + DBT_INTERNAL_DEST.session_start > dateadd(day, -7, current_date) + when matched then update ... + when not matched then insert ... +``` + +Limit the data scan of _upstream_ tables within the body of their incremental model SQL, which will limit the amount of "new" data processed/transformed. + +```sql +with large_source_table as ( + + select * from {{ ref('large_source_table') }} + {% if is_incremental() %} + where session_start > dateadd(day, -3, current_date) + {% endif %} + +), + +... +``` + +:::info +The syntax depends on how you configure your `incremental_strategy`: +- If using the `merge` strategy, you may need to explicitly alias any columns with either `DBT_INTERNAL_DEST` ("old" data) or `DBT_INTERNAL_SOURCE` ("new" data). +- There's a decent amount of conceptual overlap with the `insert_overwrite` incremental strategy. +::: + + + + diff --git a/website/docs/docs/build/jinja-macros.md b/website/docs/docs/build/jinja-macros.md index 64ccd16d331..44bc85872f5 100644 --- a/website/docs/docs/build/jinja-macros.md +++ b/website/docs/docs/build/jinja-macros.md @@ -1,32 +1,33 @@ --- title: "Jinja and macros" +description: "Read this tutorial to learn how to use jinja and macros when building in dbt."
id: "jinja-macros" --- ## Related reference docs * [Jinja Template Designer Documentation](https://jinja.palletsprojects.com/page/templates/) (external link) * [dbt Jinja context](/reference/dbt-jinja-functions) -* [Macro properties](macro-properties) +* [Macro properties](/reference/macro-properties) ## Overview In dbt, you can combine SQL with [Jinja](https://jinja.palletsprojects.com), a templating language. Using Jinja turns your dbt project into a programming environment for SQL, giving you the ability to do things that aren't normally possible in SQL. For example, with Jinja you can: * Use control structures (e.g. `if` statements and `for` loops) in SQL -* Use [environment variables](env_var) in your dbt project for production deployments +* Use [environment variables](/reference/dbt-jinja-functions/env_var) in your dbt project for production deployments * Change the way your project builds based on the current target. * Operate on the results of one query to generate another query, for example: * Return a list of payment methods, in order to create a subtotal column per payment method (pivot) * Return a list of columns in two relations, and select them in the same order to make it easier to union them together * Abstract snippets of SQL into reusable [**macros**](#macros) — these are analogous to functions in most programming languages. -In fact, if you've used the [`{{ ref() }}` function](ref), you're already using Jinja! +In fact, if you've used the [`{{ ref() }}` function](/reference/dbt-jinja-functions/ref), you're already using Jinja! -Jinja can be used in any SQL in a dbt project, including [models](/docs/build/sql-models), [analyses](analyses), [tests](/docs/build/tests), and even [hooks](hooks-operations). +Jinja can be used in any SQL in a dbt project, including [models](/docs/build/sql-models), [analyses](/docs/build/analyses), [tests](/docs/build/tests), and even [hooks](/docs/build/hooks-operations). 
:::info Ready to get started with Jinja and macros? -Check out the [tutorial on using Jinja](using-jinja) for step-by-step example of using Jinja in a model, and turning it into a macro! +Check out the [tutorial on using Jinja](/guides/advanced/using-jinja) for a step-by-step example of using Jinja in a model, and turning it into a macro! ::: @@ -69,7 +70,7 @@ group by 1 You can recognize Jinja based on the delimiters the language uses, which we refer to as "curlies": -- **Expressions `{{ ... }}`**: Expressions are used when you want to output a string. You can use expressions to reference [variables](var) and call [macros](jinja-macros#macros). +- **Expressions `{{ ... }}`**: Expressions are used when you want to output a string. You can use expressions to reference [variables](/reference/dbt-jinja-functions/var) and call [macros](/docs/build/jinja-macros#macros). - **Statements `{% ... %}`**: Statements are used for control flow, for example, to set up `for` loops and `if` statements, or to define macros. - **Comments `{# ... #}`**: Jinja comments are used to prevent the text within the comment from compiling. @@ -78,7 +79,7 @@ When used in a dbt model, your Jinja needs to compile to a valid query. To check * **Using the dbt CLI:** Run `dbt compile` from the command line. Then open the compiled SQL file in the `target/compiled/{project name}/` directory. Use a split screen in your code editor to keep both files open at once. ### Macros -[Macros](jinja-macros#macros) in Jinja are pieces of code that can be reused multiple times – they are analogous to "functions" in other programming languages, and are extremely useful if you find yourself repeating code across multiple models. Macros are defined in `.sql` files, typically in your `macros` directory ([docs](macro-paths)). 
+[Macros](/docs/build/jinja-macros) in Jinja are pieces of code that can be reused multiple times – they are analogous to "functions" in other programming languages, and are extremely useful if you find yourself repeating code across multiple models. Macros are defined in `.sql` files, typically in your `macros` directory ([docs](/reference/project-configs/macro-paths)). Macro files can contain one or more macros — here's an example: @@ -86,8 +87,8 @@ Macro files can contain one or more macros — here's an example: ```sql -{% macro cents_to_dollars(column_name, precision=2) %} - ({{ column_name }} / 100)::numeric(16, {{ precision }}) +{% macro cents_to_dollars(column_name, scale=2) %} + ({{ column_name }} / 100)::numeric(16, {{ scale }}) {% endmacro %} ``` @@ -125,9 +126,9 @@ from app_data.payments ### Using a macro from a package -A number of useful macros have also been grouped together into [packages](docs/build/packages) — our most popular package is [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). +A number of useful macros have also been grouped together into [packages](/docs/build/packages) — our most popular package is [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). -After installing a package into your project, you can use any of the macros in your own project — make sure you qualify the macro by prefixing it with the [package name](project-configs/name): +After installing a package into your project, you can use any of the macros in your own project — make sure you qualify the macro by prefixing it with the [package name](/reference/dbt-jinja-functions/project_name): ```sql @@ -139,21 +140,21 @@ select field_5, count(*) from my_table -{{ dbt_utils.group_by(5) }} +{{ dbt_utils.dimensions(5) }} ``` -You can also qualify a macro in your own project by prefixing it with your [package name](project-configs/name) (this is mainly useful for package authors). 
+You can also qualify a macro in your own project by prefixing it with your [package name](/reference/dbt-jinja-functions/project_name) (this is mainly useful for package authors). ## FAQs - - - - - - - + + + + + + + ## dbtonic Jinja @@ -184,5 +185,5 @@ Writing a macro for the first time? Check whether we've open sourced one in [dbt {% endfor %} ``` - - + + diff --git a/website/docs/docs/build/join-logic.md b/website/docs/docs/build/join-logic.md new file mode 100644 index 00000000000..9039822c9fd --- /dev/null +++ b/website/docs/docs/build/join-logic.md @@ -0,0 +1,144 @@ +--- +title: Joins +id: join-logic +description: "Joins allow you to combine data from different tables and create new metrics" +sidebar_label: "Joins" +tags: [Metrics, Semantic Layer] +--- + +Joins are a powerful part of MetricFlow and simplify the process of making all valid dimensions available for your metrics at query time, regardless of where they are defined in different semantic models. With Joins, you can also create metrics using measures from different semantic models. + +Joins use `entities` defined in your semantic model configs as the join keys between tables. Assuming entities are defined in the semantic model, MetricFlow creates a graph using the semantic models as nodes and the join paths as edges to perform joins automatically. MetricFlow chooses the appropriate join type and avoids fan-out or chasm joins with other tables based on the entity types. + +
+ What are fan-out or chasm joins? +
+
— Fan-out joins are when one row in a table is joined to multiple rows in another table, resulting in more output rows than input rows.

+ — Chasm joins are when two tables have a many-to-many relationship through an intermediate table, and the join results in duplicate or missing data.
+
+
+ + +## Types of joins + +:::tip Joins are auto-generated +MetricFlow automatically generates the necessary joins to the defined semantic objects, eliminating the need for you to create new semantic models or configuration files. + +This document explains the different types of joins that can be used with entities and how to query them using the CLI. +::: + +MetricFlow primarily uses left joins for joins, and restricts the use of fan-out and chasm joins. Refer to the table below to identify which joins are or aren't allowed based on specific entity types to prevent the creation of risky joins. + +| entity type - Table A | entity type - Table B | Join type | +|---------------------------|---------------------------|----------------------| +| Primary | Primary | ✅ Left | +| Primary | Unique | ✅ Left | +| Primary | Foreign | ❌ Fan-out (Not allowed) | +| Unique | Primary | ✅ Left | +| Unique | Unique | ✅ Left | +| Unique | Foreign | ❌ Fan-out (Not allowed) | +| Foreign | Primary | ✅ Left | +| Foreign | Unique | ✅ Left | +| Foreign | Foreign | ❌ Fan-out (Not allowed) | + +### Example + +The following example uses two semantic models with a common entity and shows a MetricFlow query that requires a join between the two semantic models. + +Let's say you have two semantic models, `transactions` and `user_signup` as seen in the following example: + +```yaml +semantic_models: + - name: transactions + entities: + - name: id + type: primary + - name: user + type: foreign + expr: user_id + measures: + - name: average_purchase_price + agg: avg + expr: purchase_price + - name: user_signup + entities: + - name: user + type: primary + expr: user_id + dimensions: + - name: type + type: categorical +``` + +MetricFlow will use `user_id` as the join key to join two semantic models, `transactions` and `user_signup`. This enables you to query the `average_purchase_price` metric in `transactions`, sliced by the `type` dimension in the `user_signup` semantic model. 
+ +Note that the `average_purchase_price` measure is defined in the `transactions` semantic model, where `user_id` is a foreign entity. However, the `user_signup` semantic model has `user_id` as a primary entity. + +Since this is a foreign-to-primary relationship, a left join is implemented where the `transactions` semantic model joins the `user_signup` semantic model, since the `average_purchase_price` measure is defined in the `transactions` semantic model. + +When querying dimensions from different semantic models using the CLI, a double underscore (or dunder) is added to the dimension name after the joining entity. In the CLI query shown below, `user_id__type` is included as a `dimension`. + +```yaml +mf query --metrics average_purchase_price --dimensions metric_time,user_id__type +``` + +## Multi-hop joins + +:::info +This feature is currently in development and not currently available. +::: + +MetricFlow allows users to join measures and dimensions across a graph of entities, which we refer to as a 'multi-hop join.' This is because users can move from one table to another like a 'hop' within a graph. + +Here's an example schema for reference: + +![Multi-Hop-Join](/img/docs/building-a-dbt-project/multihop-diagram.png) + +Notice how this schema can be translated into the three MetricFlow semantic models below to create the metric 'Average purchase price by country' using the `purchase_price` measure from the sales table and the `country_name` dimension from the `country_dim` table. 
+ +```yaml +semantic_models: + - name: sales + defaults: + agg_time_dimension: first_ordered_at + entities: + - name: id + type: primary + - name: user_id + type: foreign + measures: + - name: average_purchase_price + agg: avg + expr: purchase_price + dimensions: + - name: metric_time + type: time + type_params: + - name: user_signup + entities: + - name: user_id + type: primary + - name: country_id + type: unique + dimensions: + - name: signup_date + type: time + - name: country_dim + entities: + - name: country_id + type: primary + dimensions: + - name: country_name + type: categorical +``` + +### Query multi-hop joins + +:::info +This feature is in development and not currently available. +::: + +To query dimensions _without_ a multi-hop join involved, you can use the fully qualified dimension name with the syntax entity double underscore (dunder) dimension, like `entity__dimension`. + +For dimensions retrieved by a multi-hop join, you need to additionally provide the entity path as a list, like `user_id`. + diff --git a/website/docs/docs/build/materializations.md b/website/docs/docs/build/materializations.md index c89199edd83..79fe2e1b7c5 100644 --- a/website/docs/docs/build/materializations.md +++ b/website/docs/docs/build/materializations.md @@ -1,16 +1,18 @@ --- title: "Materializations" +description: "Read this tutorial to learn how to use materializations when building in dbt." id: "materializations" +pagination_next: "docs/build/incremental-models" --- ## Overview - -Materializations are strategies for persisting dbt models in a warehouse. There are four types of materializations built into dbt. They are: +Materializations are strategies for persisting dbt models in a warehouse. There are five types of materializations built into dbt. They are: - - - incremental - ephemeral +- materialized view ## Configuring materializations @@ -45,7 +47,7 @@ models: -Alternatively, materializations can be configured directly inside of the model sql files.
This can be useful if you are also setting [Performance Optimization] configs for specific models (for example, [Redshift specific configurations](redshift-configs) or [BigQuery specific configurations](bigquery-configs)). +Alternatively, materializations can be configured directly inside of the model sql files. This can be useful if you are also setting [Performance Optimization] configs for specific models (for example, [Redshift specific configurations](/reference/resource-configs/redshift-configs) or [BigQuery specific configurations](/reference/resource-configs/bigquery-configs)). @@ -65,7 +67,7 @@ from ... ### View When using the `view` materialization, your model is rebuilt as a view on each run, via a `create view as` statement. * **Pros:** No additional data is stored, views on top of source data will always have the latest records in them. -* **Cons:** Views that perform significant transformation, or are stacked on top of other views, are slow to query. +* **Cons:** Views that perform a significant transformation, or are stacked on top of other views, are slow to query. * **Advice:** * Generally start with views for your models, and only change to another materialization when you're noticing performance problems. * Views are best suited for models that do not do significant transformation, e.g. renaming, recasting columns. @@ -82,7 +84,7 @@ When using the `table` materialization, your model is rebuilt as a + diff --git a/website/docs/docs/build/measures.md b/website/docs/docs/build/measures.md new file mode 100644 index 00000000000..e06b5046976 --- /dev/null +++ b/website/docs/docs/build/measures.md @@ -0,0 +1,253 @@ +--- +title: Measures +id: measures +description: "Measures are aggregations performed on columns in your model." +sidebar_label: "Measures" +tags: [Metrics, Semantic Layer] +--- + +Measures are aggregations performed on columns in your model. They can be used as final metrics or serve as building blocks for more complex metrics. 
Measures have several inputs, which are described in the following table along with their field types. + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| [`name`](#name) | Provide a name for the measure, which must be unique and can't be repeated across all semantic models in your dbt project. | Required | +| [`description`](#description) | Describes the calculated measure. | Optional | +| [`agg`](#aggregation) | dbt supports aggregations such as `sum`, `min`, `max`, and more. Refer to [Aggregation](/docs/build/measures#aggregation) for the full list of supported aggregation types. | Required | +| [`expr`](#expr) | You can either reference an existing column in the table or use a SQL expression to create or derive a new one. | Optional | +| [`non_additive_dimension`](#non-additive-dimensions) | Non-additive dimensions can be specified for measures that cannot be aggregated over certain dimensions, such as bank account balances, to avoid producing incorrect results. | Optional | +| `agg_params` | specific aggregation properties such as a percentile. | Optional | +| `agg_time_dimension` | The time field. Defaults to the default agg time dimension for the semantic model. | Optional | +| `label` | How the metric appears in project docs and downstream integrations. | Required | + + +## Measure spec + +An example of the complete YAML measures spec is below. The actual configuration of your measures will depend on the aggregation you're using. + +```yaml +measures: + - name: The name of the measure + description: 'same as always' ## Optional + agg: the aggregation type. + expr: the field + agg_params: 'specific aggregation properties such as a percentile' ## Optional + agg_time_dimension: The time field. Defaults to the default agg time dimension for the semantic model. ## Optional + non_additive_dimension: 'Use these configs when you need non-additive dimensions.' 
## Optional + label: How the metric appears in project docs and downstream integrations. ## Required +``` + +### Name + +When you create a measure, you can either give it a custom name or use the `name` of the data platform column directly. If the `name` of the measure is different from the column name, you need to add an `expr` to specify the column name. The `name` of the measure is used when creating a metric. + +Measure names must be **unique** across all semantic models in a project. + +### Description + +The description describes the calculated measure. It's strongly recommended you create verbose and human-readable descriptions in this field. + +### Aggregation + +The aggregation determines how the field will be aggregated. For example, a `sum` aggregation type over a granularity of `day` would sum the values across a given day. + +Supported aggregations include: + +| Aggregation types | Description | +|-------------------|--------------------------| +| sum | Sum across the values | +| min | Minimum across the values| +| max | Maximum across the values| +| average | Average across the values | +| sum_boolean | A sum for a boolean type | +| count_distinct | Distinct count of values | +| median | Median (p50) calculation across the values | +| percentile | Percentile calculation across the values | + + +### Expr + +If the `name` you specified for a measure doesn't match a column name in your model, you can use the `expr` parameter instead. This allows you to use any valid SQL to manipulate an underlying column name into a specific output. The `name` parameter then serves as an alias for your measure. + +**Notes**: When using SQL functions in the `expr` parameter, **always use data platform-specific SQL**. This is because outputs may differ depending on your specific data platform. 
+ +:::tip For Snowflake users +For Snowflake users, if you use a week-level function in the `expr` parameter, it'll now return Monday as the default week start day based on ISO standards. If you have any account or session level overrides for the `WEEK_START` parameter that fix it to a value other than 0 or 1, you will still see Monday as the week start. + +If you use the `dayofweek` function in the `expr` parameter with the legacy Snowflake default of `WEEK_START = 0`, it will now return ISO-standard values of 1 (Monday) through 7 (Sunday) instead of Snowflake's legacy default values of 0 (Monday) through 6 (Sunday). +::: + + +### Model with different aggregations + +```yaml +semantic_models: + - name: transactions + description: A record of every transaction that takes place. Carts are considered multiple transactions for each SKU. + model: ref('schema.transactions') + defaults: + agg_time_dimensions: + +# --- entities --- + entities: + - name: transaction_id + type: primary + - name: customer_id + type: foreign + - name: store_id + type: foreign + - name: product_id + type: foreign + + # --- measures --- + measures: + - name: transaction_amount_usd + description: Total USD value of transactions + expr: transaction_amount_usd + agg: sum + - name: transaction_amount_usd_avg + description: Average USD value of transactions + expr: transaction_amount_usd + agg: average + - name: transaction_amount_usd_max + description: Maximum USD value of transactions + expr: transaction_amount_usd + agg: max + - name: transaction_amount_usd_min + description: Minimum USD value of transactions + expr: transaction_amount_usd + agg: min + - name: quick_buy_transactions + description: The total transactions bought as quick buy + expr: quick_buy_flag + agg: sum_boolean + - name: distinct_transactions_count + description: Distinct count of transactions + expr: transaction_id + agg: count_distinct + - name: transactions + description: The average value of transactions + expr: 
transaction_amount_usd + agg: average + - name: transactions_amount_usd_valid #Notice here how we use expr to compute the aggregation based on a condition + description: The total USD value of valid transactions only + expr: CASE WHEN is_valid = True then transaction_amount_usd else 0 end + agg: sum + - name: transactions + description: The average value of transactions. + expr: transaction_amount_usd + agg: average + - name: p99_transaction_value + description: The 99th percentile transaction value + expr: transaction_amount_usd + agg: percentile + agg_params: + percentile: .99 + use_discrete_percentile: False #False will calculate the continuous percentile and True will calculate the discrete percentile + - name: median_transaction_value + description: The median transaction value + expr: transaction_amount_usd + agg: median + +# --- dimensions --- + dimensions: + - name: metric_time + type: time + expr: date_trunc('day', ts) #expr refers to underlying column ts + type_params: + time_granularity: day + - name: is_bulk_transaction + type: categorical + expr: case when quantity > 10 then true else false end + +``` + +### Non-additive dimensions + +Some measures cannot be aggregated over certain dimensions, like time, because it could result in incorrect outcomes. Examples include bank account balances where it does not make sense to carry over balances month-to-month, and monthly recurring revenue where daily recurring revenue cannot be summed up to achieve monthly recurring revenue. You can specify non-additive dimensions to handle this, where certain dimensions are excluded from aggregation. + +To demonstrate the configuration for non-additive measures, consider a subscription table that includes one row per date of the registered user, the user's active subscription plan(s), and the plan's subscription value (revenue) with the following columns: + +- `date_transaction`: The daily date-spine. +- `user_id`: The ID pertaining to the registered user.
+- `subscription_plan`: A column to indicate the subscription plan ID. +- `subscription_value`: A column to indicate the monthly subscription value (revenue) of a particular subscription plan ID. + +Parameters under the `non_additive_dimension` will specify dimensions that the measure should not be aggregated over. + +| Parameter | Description | Field type | +| --- | --- | --- | +| `name`| This will be the name of the time dimension (that has already been defined in the data source) that the measure should not be aggregated over. | Required | +| `window_choice` | Choose either `min` or `max`, where `min` reflects the beginning of the time period and `max` reflects the end of the time period. | Required | +| `window_groupings` | Provide the entities that you would like to group by. | Optional | + + +```yaml +semantic_models: + - name: subscription_table + description: A subscription table with one row per date for each active user and their subscription plans. + model: ref('your_schema.subscription_table') + defaults: + agg_time_dimension: metric_time + + entities: + - name: user_id + type: foreign + + dimensions: + - name: metric_time + type: time + expr: date_transaction + type_params: + time_granularity: day + + measures: + - name: count_users_end_of_month + description: Count of users at the end of the month + expr: 1 + agg: sum + non_additive_dimension: + name: metric_time + window_choice: min + - name: mrr_end_of_month + description: Aggregate by summing all users' active subscription plans at the end of month + expr: subscription_value + agg: sum + non_additive_dimension: + name: metric_time + window_choice: max + - name: mrr_by_user_end_of_month + description: Group by user_id to achieve each user's MRR at the end of the month + expr: subscription_value + agg: sum + non_additive_dimension: + name: metric_time + window_choice: max + window_groupings: + - user_id + +metrics: + - name: mrr_end_of_month + type: simple + type_params: + measure: mrr_end_of_month 
+``` + +We can query the semi-additive metrics using the following syntax: + +For dbt Cloud: + +```bash +dbt sl query --metrics mrr_end_of_month --group-by metric_time__month --order metric_time__month +dbt sl query --metrics mrr_end_of_month --group-by metric_time__week --order metric_time__week +``` + +For dbt Core: + +```bash +mf query --metrics mrr_end_of_month --group-by metric_time__month --order metric_time__month +mf query --metrics mrr_end_of_month --group-by metric_time__week --order metric_time__week +``` + +import SetUpPages from '/snippets/_metrics-dependencies.md'; + + diff --git a/website/docs/docs/build/metricflow-commands.md b/website/docs/docs/build/metricflow-commands.md new file mode 100644 index 00000000000..049b415d40e --- /dev/null +++ b/website/docs/docs/build/metricflow-commands.md @@ -0,0 +1,554 @@ +--- +title: MetricFlow commands +id: metricflow-commands +description: "Query metrics and metadata in your dbt project with the MetricFlow commands." +sidebar_label: "MetricFlow commands" +tags: [Metrics, Semantic Layer] +--- + +Once you define metrics in your dbt project, you can query metrics, dimensions, dimension values, and validate your configs using the MetricFlow commands. + +MetricFlow allows you to define and query metrics in your dbt project in the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation), [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud), or [dbt Core](/docs/core/installation). To experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and dynamically query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. + +MetricFlow is compatible with Python versions 3.8, 3.9, 3.10 and 3.11. + + +## MetricFlow + +MetricFlow is a dbt package that allows you to define and query metrics in your dbt project. 
You can use MetricFlow to query metrics in your dbt project in the dbt Cloud CLI, dbt Cloud IDE, or dbt Core. + +**Note** — MetricFlow commands aren't supported in dbt Cloud jobs yet. However, you can add MetricFlow validations with your git provider (such as GitHub Actions) by installing MetricFlow (`pip install metricflow`). This allows you to run MetricFlow commands as part of your continuous integration checks on PRs. + + + + + +MetricFlow commands are embedded in the dbt Cloud CLI, which means you can immediately run them once you install the dbt Cloud CLI. + +A benefit to using the dbt Cloud CLI or dbt Cloud IDE is that you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning for you. + + + + + +:::info +You can create metrics using MetricFlow in the dbt Cloud IDE. However, support for running MetricFlow commands in the IDE will be available soon. +::: + + + + + + +:::info Use dbt Cloud CLI for semantic layer development + +Use the dbt Cloud CLI for the best experience in defining and querying metrics in your dbt project on dbt Cloud or dbt Core with MetricFlow. + +A benefit to using the dbt Cloud CLI or dbt Cloud IDE is that you won't need to manage versioning — your dbt Cloud account will automatically manage the versioning for you. +::: + + +You can install [MetricFlow](https://github.com/dbt-labs/metricflow#getting-started) from [PyPI](https://pypi.org/project/dbt-metricflow/). You need to use `pip` to install MetricFlow on Windows or Linux operating systems: + +1. Create or activate your virtual environment: `python -m venv venv` +2. Run `pip install dbt-metricflow` + * You can install MetricFlow using PyPI as an extension of your dbt adapter in the command line. To install the adapter, run `pip install "dbt-metricflow[your_adapter_name]"` and add the adapter name at the end of the command. 
For example, for a Snowflake adapter run `pip install "dbt-metricflow[snowflake]"` + +**Note**, you'll need to manage versioning between dbt Core, your adapter, and MetricFlow. + + + + + + +## MetricFlow commands + +MetricFlow provides the following commands to retrieve metadata and query metrics. + + + + +Use the `dbt sl` prefix before the command name to execute them in dbt Cloud. For example, to list all metrics, run `dbt sl list metrics`. + +- [`list`](#list) — Retrieves metadata values. +- [`list metrics`](#list-metrics) — Lists metrics with dimensions. +- [`list dimensions`](#list-dimensions) — Lists unique dimensions for metrics. +- [`list dimension-values`](#list-dimension-values) — Lists dimension values with the corresponding metrics. +- [`list entities`](#list-entities) — Lists all unique entities. +- [`query`](#query) — Query metrics and dimensions you want to see in the command line interface. Refer to [query examples](#query-examples) to help you get started. + + + + + + + +Use the `mf` prefix before the command name to execute them in dbt Core. For example, to list all metrics, run `mf list metrics`. + +- [`list`](#list) — Retrieves metadata values. +- [`list metrics`](#list-metrics) — Lists metrics with dimensions. +- [`list dimensions`](#list-dimensions) — Lists unique dimensions for metrics. +- [`list dimension-values`](#list-dimension-values) — Lists dimension values with the corresponding metrics. +- [`list entities`](#list-entities) — Lists all unique entities. +- [`validate-configs`](#validate-configs) — Validates semantic model configurations. +- [`health-checks`](#health-checks) — Performs data platform health check. +- [`tutorial`](#tutorial) — Dedicated MetricFlow tutorial to help get you started. +- [`query`](#query) — Query metrics and dimensions you want to see in the command line interface. Refer to [query examples](#query-examples) to help you get started. 
+ + + + +### List + +This command retrieves metadata values related to [Metrics](/docs/build/metrics-overview), [Dimensions](/docs/build/dimensions), and [Entities](/docs/build/entities) values. + +```bash +dbt sl list # In dbt Cloud +mf list # In dbt Core +``` + +### List metrics + +This command lists the metrics with their available dimensions: + +```bash +dbt sl list metrics # In dbt Cloud + +mf list metrics # In dbt Core + +Options: + --search TEXT Filter available metrics by this search term + --show-all-dimensions Show all dimensions associated with a metric. + --help Show this message and exit. +``` + +### List dimensions + +This command lists all unique dimensions for a metric or multiple metrics. It displays only common dimensions when querying multiple metrics: + +```bash +dbt sl list dimensions --metrics # In dbt Cloud + +mf list dimensions --metrics # In dbt Core + +Options: + --metrics SEQUENCE List dimensions by given metrics (intersection). Ex. --metrics bookings,messages + --help Show this message and exit. +``` + +### List dimension-values + +This command lists all dimension values with the corresponding metric: + +```bash +dbt sl list dimension-values --metrics --dimension # In dbt Cloud + +mf list dimension-values --metrics --dimension # In dbt Core + +Options: + --dimension TEXT Dimension to query values from [required] + --metrics SEQUENCE Metrics that are associated with the dimension + [required] + --end-time TEXT Optional iso8601 timestamp to constraint the end time of + the data (inclusive) + --start-time TEXT Optional iso8601 timestamp to constraint the start time + of the data (inclusive) + --help Show this message and exit. +``` + +### List entities + +This command lists all unique entities: + +```bash +dbt sl list entities --metrics # In dbt Cloud + +mf list entities --metrics # In dbt Core + +Options: + --metrics SEQUENCE List entities by given metrics (intersection). Ex. --metrics bookings,messages + --help Show this message and exit. 
+``` + +### Validate-configs + +This command performs validations against the defined semantic model configurations: + +```bash +dbt sl validate-configs # In dbt Cloud + +mf validate-configs # In dbt Core + +Options: + --dw-timeout INTEGER Optional timeout for data warehouse + validation steps. Default None. + --skip-dw If specified, skips the data warehouse + validations + --show-all If specified, prints warnings and future- + errors + --verbose-issues If specified, prints any extra details + issues might have + --semantic-validation-workers INTEGER + Optional. Uses the number of workers + specified to run the semantic validations. + Should only be used for exceptionally large + configs + --help Show this message and exit. +``` + +### Health checks + +This command performs a health check against the data platform you provided in the configs: + +```bash +dbt sl health-checks #in dbt Cloud + +mf health-checks #in dbt Core +``` + +### Tutorial + +Follow the dedicated MetricFlow tutorial to help you get started: + +```bash +dbt sl tutorial # In dbt Cloud + +mf tutorial # In dbt Core +``` + +### Query + +Create a new query with MetricFlow, execute that query against the user's data platform, and return the result: + +```bash +dbt sl query --metrics --group-by # In dbt Cloud + +mf query --metrics --group-by # In dbt Core + +Options: + + --metrics SEQUENCE Metrics to query for: syntax is --metrics bookings + or for multiple metrics --metrics bookings, messages. + + --group-by SEQUENCE Dimensions and/or entities to group by: syntax is + --group-by ds or for multiple group bys --group-by + ds, org. + + --end-time TEXT Optional iso8601 timestamp to constraint the end + time of the data (inclusive) + + --start-time TEXT Optional iso8601 timestamp to constraint the start + time of the data (inclusive) + + --where TEXT SQL-like where statement provided as a string. For + example: --where "revenue > 100". 
To add a dimension filter to + a where filter, you have to indicate that the filter item is part of your model. + Refer to the [FAQ](#faqs) for more info on how to do this using a template wrapper. + + --limit TEXT Limit the number of rows out using an int or leave + blank for no limit. For example: --limit 100 + + --order SEQUENCE Metrics or group bys to order by ("-" prefix for + DESC). For example: --order -ds or --order + ds,-revenue + + --csv FILENAME Provide filepath for data frame output to csv + + --compile (dbt Cloud) In the query output, show the query that was + --explain (dbt Core) executed against the data warehouse + + + --show-dataflow-plan Display dataflow plan in explain output + + --display-plans Display plans (such as metric dataflow) in the browser + + --decimals INTEGER Choose the number of decimal places to round for + the numerical values + + --show-sql-descriptions Shows inline descriptions of nodes in displayed SQL + + --help Show this message and exit. + ``` + + +### Query examples + +The following tabs present various different types of query examples that you can use to query metrics and dimensions. Select the tab that best suits your needs: + + + + + +Use the example to query metrics by dimension and return the `order_total` metric by `metric_time.` + +**Query** +```bash +dbt sl query --metrics order_total --group-by metric_time # In dbt Cloud + +mf query --metrics order_total --group-by metric_time # In dbt Core +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 1.24 seconds +| METRIC_TIME | ORDER_TOTAL | +|:--------------|---------------:| +| 2017-06-16 | 792.17 | +| 2017-06-17 | 458.35 | +| 2017-06-18 | 490.69 | +| 2017-06-19 | 749.09 | +| 2017-06-20 | 712.51 | +| 2017-06-21 | 541.65 | +``` + + + + +You can include multiple dimensions in a query. For example, you can group by the `is_food_order` dimension to confirm if orders were for food or not. 
+ +**Query** +```bash +dbt sl query --metrics order_total --group-by metric_time, is_food_order # In dbt Cloud + +mf query --metrics order_total --group-by metric_time, is_food_order # In dbt Core +``` + +**Result** +```bash + Success 🦄 - query completed after 1.70 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-06-16 | True | 499.27 | +| 2017-06-16 | False | 292.90 | +| 2017-06-17 | True | 431.24 | +| 2017-06-17 | False | 27.11 | +| 2017-06-18 | True | 466.45 | +| 2017-06-18 | False | 24.24 | +| 2017-06-19 | False | 300.98 | +| 2017-06-19 | True | 448.11 | +``` + + + + + + +You can add order and limit functions to filter and present the data in a readable format. The following query limits the data set to 10 records and orders them by `metric_time`, descending. + +**Query** +```bash +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time + +# In dbt Core +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 1.41 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-31 | True | 459.90 | +| 2017-08-31 | False | 327.08 | +| 2017-08-30 | False | 348.90 | +| 2017-08-30 | True | 448.18 | +| 2017-08-29 | True | 479.94 | +| 2017-08-29 | False | 333.65 | +| 2017-08-28 | False | 334.73 | +``` + + + + +You can further filter the data set by adding a `where` clause to your query. 
+ +**Query** + +```bash +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time --where "{{ Dimension('order_id__is_food_order') }} = True" + +# In dbt Core +mf query --metrics order_total --group-by metric_time --where "{{ Dimension('order_id__is_food_order') }} = True" +``` + +**Result** +```bash + ✔ Success 🦄 - query completed after 1.06 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-31 | True | 459.90 | +| 2017-08-30 | True | 448.18 | +| 2017-08-29 | True | 479.94 | +| 2017-08-28 | True | 513.48 | +| 2017-08-27 | True | 568.92 | +| 2017-08-26 | True | 471.95 | +| 2017-08-25 | True | 452.93 | +| 2017-08-24 | True | 384.40 | +| 2017-08-23 | True | 423.61 | +| 2017-08-22 | True | 401.91 | +``` + + + + + +To filter by time, there are dedicated start and end time options. Using these options to filter by time allows MetricFlow to further optimize query performance by pushing down the where filter when appropriate. + +**Query** +```bash + +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' + +# In dbt Core +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' +``` + + **Result** +```bash +✔ Success 🦄 - query completed after 1.53 seconds +| METRIC_TIME | IS_FOOD_ORDER | ORDER_TOTAL | +|:--------------|:----------------|---------------:| +| 2017-08-27 | True | 568.92 | +| 2017-08-26 | True | 471.95 | +| 2017-08-25 | True | 452.93 | +| 2017-08-24 | True | 384.40 | +| 2017-08-23 | True | 423.61 | +| 2017-08-22 | True | 401.91 | +``` + + + + + + + +### Additional query examples + +The following tabs present additional query examples, like exporting to a CSV. 
Select the tab that best suits your needs: + + + + + + + +Add `--compile` (or `--explain` for dbt Core users) to your query to view the SQL generated by MetricFlow. + +**Query** + +```bash +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --compile + +# In dbt Core +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --explain +``` + + **Result** + ```bash + ✔ Success 🦄 - query completed after 0.28 seconds +🔎 SQL (remove --compile to see data or add --show-dataflow-plan to see the generated dataflow plan): +SELECT + metric_time + , is_food_order + , SUM(order_cost) AS order_total +FROM ( + SELECT + cast(ordered_at as date) AS metric_time + , is_food_order + , order_cost + FROM ANALYTICS.js_dbt_sl_demo.orders orders_src_1 + WHERE cast(ordered_at as date) BETWEEN CAST('2017-08-22' AS TIMESTAMP) AND CAST('2017-08-27' AS TIMESTAMP) +) subq_3 +WHERE is_food_order = True +GROUP BY + metric_time + , is_food_order +ORDER BY metric_time DESC +LIMIT 10 +``` + + + + + +Add the `--csv file_name.csv` flag to export the results of your query to a csv. 
+ +**Query** + +```bash +# In dbt Cloud +dbt sl query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --csv query_example.csv + +# In dbt Core +mf query --metrics order_total --group-by metric_time,is_food_order --limit 10 --order -metric_time --where "is_food_order = True" --start-time '2017-08-22' --end-time '2017-08-27' --csv query_example.csv +``` + +**Result** +```bash +✔ Success 🦄 - query completed after 0.83 seconds +🖨 Successfully written query output to query_example.csv +``` + + + + +### Time granularity + +Optionally, you can specify the time granularity you want your data to be aggregated at by appending two underscores and the unit of granularity you want to `metric_time`, the global time dimension. You can group the granularity by: `day`, `week`, `month`, `quarter`, and `year`. + +Below is an example for querying metric data at a monthly grain: + +```bash +dbt sl query --metrics revenue --group-by metric_time__month # In dbt Cloud + +mf query --metrics revenue --group-by metric_time__month # In dbt Core +``` + +## FAQs + +
+How can I add a dimension filter to a where filter? + +To add a dimension filter to a where filter, you have to indicate that the filter item is part of your model and use a template wrapper: {{Dimension('primary_entity__dimension_name')}}. + +Here's an example query: dbt sl query --metrics order_total --group-by metric_time --where "{{Dimension('order_id__is_food_order')}} = True".

Before using the template wrapper, however, you will need to set up your terminal to escape curly braces for the filter template to work. + +
+How to set up your terminal to escape curly braces? + To configure your .zshrc profile to escape curly braces, you can use the setopt command to enable the BRACECCL option. This option will cause the shell to treat curly braces as literals and prevent brace expansion. Refer to the following steps to set it up: 
+ +1. Open your terminal. +2. Open your .zshrc file using a text editor like nano, vim, or any other text editor you prefer. You can use the following command to open it with nano: + +```bash +nano ~/.zshrc +``` +3. Add the following line to the file: + +```bash +setopt BRACECCL +``` +4. Save and exit the text editor (in `nano`, press Ctrl + O to save, and Ctrl + X to exit). + +5. Source your .zshrc file to apply the changes: + +```bash +source ~/.zshrc +``` + +6. After making these changes, your Zsh shell will treat curly braces as literal characters and will not perform brace expansion. This means that you can use curly braces without worrying about unintended expansions. + +Keep in mind that modifying your shell configuration files can have an impact on how your shell behaves. If you're not familiar with shell configuration, it's a good idea to make a backup of your .zshrc file before making any changes. If you encounter any issues or unexpected behavior, you can revert to the backup. + + +
+ +
diff --git a/website/docs/docs/build/metricflow-time-spine.md b/website/docs/docs/build/metricflow-time-spine.md new file mode 100644 index 00000000000..997d85e38a8 --- /dev/null +++ b/website/docs/docs/build/metricflow-time-spine.md @@ -0,0 +1,131 @@ +--- +title: MetricFlow time spine +id: metricflow-time-spine +description: "MetricFlow expects a default timespine table called metricflow_time_spine" +sidebar_label: "MetricFlow time spine" +tags: [Metrics, Semantic Layer] +--- + +MetricFlow uses a timespine table to construct cumulative metrics. By default, MetricFlow expects the timespine table to be named `metricflow_time_spine` and doesn't support using a different name. + +To create this table, you need to create a model in your dbt project called `metricflow_time_spine` and add the following code: + + + + + +```sql +{{ + config( + materialized = 'table', + ) +}} + +with days as ( + + {{ + dbt_utils.date_spine( + 'day', + "to_date('01/01/2000','mm/dd/yyyy')", + "to_date('01/01/2027','mm/dd/yyyy')" + ) + }} + +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * from final +``` + + + + + +```sql +{{ + config( + materialized = 'table', + ) +}} + +with days as ( + + {{ + dbt.date_spine( + 'day', + "to_date('01/01/2000','mm/dd/yyyy')", + "to_date('01/01/2027','mm/dd/yyyy')" + ) + }} + +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * from final +``` + + + + + + + +```sql +-- filename: metricflow_time_spine.sql +-- BigQuery supports DATE() instead of TO_DATE(). Use this model if you're using BigQuery +{{config(materialized='table')}} +with days as ( + {{dbt_utils.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2030,01,01)" + ) + }} +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * +from final +``` + + + + + +```sql +-- filename: metricflow_time_spine.sql +-- BigQuery supports DATE() instead of TO_DATE(). 
Use this model if you're using BigQuery +{{config(materialized='table')}} +with days as ( + {{dbt.date_spine( + 'day', + "DATE(2000,01,01)", + "DATE(2030,01,01)" + ) + }} +), + +final as ( + select cast(date_day as date) as date_day + from days +) + +select * +from final +``` + + + +You only need to include the `date_day` column in the table. MetricFlow can handle broader levels of detail, but it doesn't currently support finer grains. diff --git a/website/docs/docs/build/metrics-overview.md b/website/docs/docs/build/metrics-overview.md new file mode 100644 index 00000000000..81af149a7d9 --- /dev/null +++ b/website/docs/docs/build/metrics-overview.md @@ -0,0 +1,184 @@ +--- +title: Creating metrics +id: metrics-overview +description: "Metrics can be defined in the same or separate YAML files from semantic models within the same dbt project repo." +sidebar_label: "Creating metrics" +tags: [Metrics, Semantic Layer] +pagination_next: "docs/build/cumulative" +--- + +Once you've created your semantic models, it's time to start adding metrics! Metrics can be defined in the same YAML files as your semantic models, or split into separate YAML files into any other subdirectories (provided that these subdirectories are also within the same dbt project repo) + +The keys for metrics definitions are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | Provide the reference name for the metric. This name must be unique amongst all metrics. | Required | +| `description` | Provide the description for your metric. | Optional | +| `type` | Define the type of metric, which can be `simple`, `ratio`, `cumulative`, or `derived`. | Required | +| `type_params` | Additional parameters used to configure metrics. `type_params` are different for each metric type. | Required | +| `config` | Provide the specific configurations for your metric. | Optional | +| `label` | The display name for your metric. This value will be shown in downstream tools. 
| Required | +| `filter` | You can optionally add a filter string to any metric type, applying filters to dimensions, entities, or time dimensions during metric computation. Consider it as your WHERE clause. | Optional | +| `meta` | Additional metadata you want to add to your metric. | Optional | + + +Here's a complete example of the metrics spec configuration: + +```yaml +metrics: + - name: metric name ## Required + description: same as always ## Optional + type: the type of the metric ## Required + type_params: ## Required + - specific properties for the metric type + config: here for `enabled` ## Optional + label: The display name for your metric. This value will be shown in downstream tools. ## Required + filter: | ## Optional + {{ Dimension('entity__name') }} > 0 and {{ Dimension('entity__another_name') }} is not + null +``` + +This page explains the different supported metric types you can add to your dbt project. + + +### Cumulative metrics + +[Cumulative metrics](/docs/build/cumulative) aggregate a measure over a given window. If no window is specified, the measure is accumulated over all time. **Note**, you will need to create the [time spine model](/docs/build/metricflow-time-spine) before you add cumulative metrics. + +```yaml +# Cumulative metrics aggregate a measure over a given window. The window is considered infinite if no window parameter is passed (accumulate the measure over all time) +metrics: + - name: wau_rolling_7 + owners: + - support@getdbt.com + type: cumulative + type_params: + measures: + - distinct_users + #Omitting window will accumulate the measure over all time + window: 7 days + +``` +### Derived metrics + +[Derived metrics](/docs/build/derived) are defined as an expression of other metrics. Derived metrics allow you to do calculations on top of metrics. + +```yaml +metrics: + - name: order_gross_profit + description: Gross profit from each order. 
+ type: derived + label: Order Gross Profit + type_params: + expr: revenue - cost + metrics: + - name: order_total + alias: revenue + - name: order_cost + alias: cost +``` + + +### Ratio metrics + +[Ratio metrics](/docs/build/ratio) involve a numerator metric and a denominator metric. A `filter` string can be applied to both the numerator and denominator, or applied separately to the numerator or denominator. + +```yaml +# Ratio Metric +metrics: + - name: cancellation_rate + owners: + - support@getdbt.com +# Ratio metrics create a ratio out of two metrics. +# Define the metrics from the semantic manifest as numerator or denominator + type: ratio + type_params: + numerator: cancellations + denominator: transaction_amount + filter: | # add optional filter string. This applies to both the numerator and denominator + {{ Dimension('customer__country') }} = 'MX' + - name: enterprise_cancellation_rate + owners: + - support@getdbt.com + # Ratio metrics create a ratio out of two metrics. + # Define the metrics from the semantic manifest as numerator or denominator + type: ratio + type_params: + numerator: + name: cancellations + filter: {{ Dimension('company__tier' )}} = 'enterprise' # filter only applies to the numerator + denominator: transaction_amount + filter: | # add optional filter string. This applies to both the numerator and denominator + {{ Dimension('customer__country') }} = 'MX' +``` +### Simple metrics + +[Simple metrics](/docs/build/simple) point directly to a measure. You may think of it as a function that takes only one measure as the input. + +- `name`— Use this parameter to define the reference name of the metric. The name must be unique amongst metrics and can include lowercase letters, numbers, and underscores. You can use this name to call the metric from the dbt Semantic Layer API. + + +```yaml +metrics: + - name: cancellations + type: simple + type_params: + measure: cancellations_usd # Specify the measure you are creating a proxy for. 
+ filter: | + {{ Dimension('order__value')}} > 100 and {{Dimension('user__acquisition')}} +``` + +## Filters + +A filter is configured using Jinja templating. Use the following syntax to reference entities, dimensions, and time dimensions in filters: +```yaml +filter: | + {{ Entity('entity_name') }} +filter: | + {{ Dimension('primary_entity__dimension_name') }} +filter: | + {{ TimeDimension('time_dimension', 'granularity') }} +``` +### Further configuration + +You can set more metadata for your metrics, which can be used by other tools later on. The way this metadata is used will vary based on the specific integration partner + +- **Description** — Write a detailed description of the metric. + + + + +## Related docs + +- [Semantic models](/docs/build/semantic-models) +- [Cumulative](/docs/build/cumulative) +- [Derived](/docs/build/derived) + + + + diff --git a/website/docs/docs/build/metrics.md b/website/docs/docs/build/metrics.md index 681aec63dca..7a505fdad14 100644 --- a/website/docs/docs/build/metrics.md +++ b/website/docs/docs/build/metrics.md @@ -4,18 +4,37 @@ id: "metrics" description: "When you define metrics in dbt projects, you encode crucial business logic in tested, version-controlled code. The dbt metrics layer helps you standardize metrics within your organization." keywords: - dbt metrics layer +tags: [Metrics] --- - +:::caution Upgrade to access MetricFlow and the new dbt Semantic Layer -* **v1.3.0**: Metrics have been moved out of the experimental phase -* **v1.0.0**: Metrics are new and experimental +The dbt_metrics package has been deprecated and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6). If you're using the dbt_metrics package or the legacy Semantic Layer (available on v1.5 or lower), we **highly** recommend [upgrading your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to access MetricFlow and the new [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl?version=1.6). 
- +To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. + +::: + + + +The dbt Semantic Layer has undergone a [significant revamp](https://www.getdbt.com/blog/dbt-semantic-layer-whats-next/), improving governance, introducing new APIs, and making it more efficient to define/query metrics. This revamp means the dbt_metrics package and the legacy Semantic Layer, available in dbt v1.5 or lower, are no longer supported and won't receive any code fixes. + +**What’s changed?**

+The dbt_metrics package has been [deprecated](https://docs.getdbt.com/blog/deprecating-dbt-metrics) and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new framework for defining metrics in dbt. This means dbt_metrics is no longer supported after dbt v1.5 and won't receive any code fixes. We will also remove the dbt_metrics spec and docs when it's fully deprecated. + +**Who does this affect?**

+Anyone who uses the dbt_metrics package or is integrated with the legacy Semantic Layer. The new Semantic Layer is available to [Team or Enterprise](https://www.getdbt.com/pricing/) multi-tenant dbt Cloud plans [hosted in North America](/docs/cloud/about-cloud/regions-ip-addresses). You must be on dbt v1.6 or higher to access it. All users can define metrics using MetricFlow. Users on dbt Cloud Developer plans or dbt Core can only use it to define and test metrics locally, but can't dynamically query them with integrated tools. + +**What should you do?**

+If you've defined metrics using dbt_metrics or integrated with the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use MetricFlow or the new dbt Semantic Layer. To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. + + +
+ + -## About Metrics -A metric is a timeseries aggregation over a that supports zero or more dimensions. Some examples of metrics include: +A metric is an aggregation over a that supports zero or more dimensions. Some examples of metrics include: - active users - monthly recurring revenue (mrr) @@ -42,7 +61,7 @@ You can define metrics in `.yml` files nested under a `metrics:` key. Metric nam - begin with a letter - contain no more than 250 characters -For a short human-friendly name with title casing, spaces, and special characters, use the `label` property. More examples and guidance for how to [define and structure metrics can be found here.](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics). +For a short human-friendly name with title casing, spaces, and special characters, use the `label` property. ### Example definition @@ -69,7 +88,7 @@ metrics: expression: user_id timestamp: signup_date - time_grains: [day, week, month, quarter, year, all_time] + time_grains: [day, week, month, quarter, year] dimensions: - plan @@ -151,7 +170,7 @@ metrics: :::caution -- You cannot define metrics on [ephemeral models](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/materializations#ephemeral). To define a metric, the materialization must have a representation in the data warehouse. +- You cannot define metrics on [ephemeral models](https://docs.getdbt.com/docs/build/materializations#ephemeral). To define a metric, the materialization must have a representation in the data warehouse. ::: @@ -168,9 +187,9 @@ Metrics can have many declared **properties**, which define aspects of your metr | label | A short for name / label for the metric | New Customers | yes | | description | Long form, human-readable description for the metric | The number of customers who.... 
| no | | calculation_method | The method of calculation (aggregation or derived) that is applied to the expression | count_distinct | yes | -| expression | The expression to aggregate/calculate over | user_id, cast(user_id as int) | yes | -| timestamp | The time-based component of the metric | signup_date | yes | -| time_grains | One or more "grains" at which the metric can be evaluated. For more information, see the "Custom Calendar" section. | [day, week, month, quarter, year] | yes | +| expression | The expression to aggregate/calculate over | user_id, cast(user_id as int) |yes | +| timestamp | The time-based component of the metric | signup_date | no yes | +| time_grains | One or more "grains" at which the metric can be evaluated. For more information, see the "Custom Calendar" section. | [day, week, month, quarter, year] | no yes | | dimensions | A list of dimensions to group or filter the metric by | [plan, country] | no | | window | A dictionary for aggregating over a window of time. Used for rolling metrics such as 14 day rolling average. Acceptable periods are: [`day`,`week`,`month`, `year`, `all_time`] | {count: 14, period: day} | no | | filters | A list of filters to apply before calculating the metric | See below | no | @@ -201,14 +220,17 @@ Metrics can have many declared **properties**, which define aspects of your metr ### Available calculation methods + The method of calculation (aggregation or derived) that is applied to the expression. + + The type of calculation (aggregation or expression) that is applied to the sql property. 
-| Metric Calculation Method Metric Type | Description | +| Metric Calculation Method | Description | |----------------|----------------------------------------------------------------------------| | count | This metric type will apply the `count` aggregation to the specified field | | count_distinct | This metric type will apply the `count` aggregation to the specified field, with an additional distinct statement inside the aggregation | @@ -216,6 +238,7 @@ The type of calculation (aggregation or expression) that is applied to the sql p | average | This metric type will apply the `average` aggregation to the specified field | | min | This metric type will apply the `min` aggregation to the specified field | | max | This metric type will apply the `max` aggregation to the specified field | +| median | This metric type will apply the `median` aggregation to the specified field, or an alternative `percentile_cont` aggregation if `median` is not available | |derived expression | This metric type is defined as any _non-aggregating_ calculation of 1 or more metrics | @@ -306,7 +329,7 @@ Filters should be defined as a list of dictionaries that define predicates for t All three properties (`field`, `operator`, `value`) are required for each defined filter. -Note that `value` must be defined as a string in YAML, because it will be compiled into queries as part of a string. If your filter's value needs to be surrounded in quotes inside the query (e.g. text or dates), use `"'nested'"` quotes: +Note that `value` must be defined as a string in YAML, because it will be compiled into queries as part of a string. If your filter's value needs to be surrounded by quotes inside the query (e.g. 
text or dates), use `"'nested'"` quotes: ```yml filters: @@ -324,42 +347,122 @@ Note that `value` must be defined as a string in YAML, because it will be compil value: "'2020-01-01'" ``` -## Querying Your Metric -You can dynamically query metrics directly in dbt and verify them before running a job in the deployment environment. To query your defined metric, you must have the [dbt_metrics package](https://github.com/dbt-labs/dbt_metrics) installed. Information on how to [install packages can be found here](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#how-do-i-add-a-package-to-my-project). +### Calendar +The dbt_metrics package contains a [basic calendar table](https://github.com/dbt-labs/dbt_metrics/blob/main/models/dbt_metrics_default_calendar.sql) that is created as part of your `dbt run`. It contains dates between 2010-01-01 and 2029-12-31. -Use the following [metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/) installation code in your packages.yml file and run `dbt deps` to install the metrics package: +If you want to use a custom calendar, you can replace the default with any table which meets the following requirements: +- Contains a `date_day` column. +- Contains the following columns: `date_week`, `date_month`, `date_quarter`, `date_year`, or equivalents. +- Additional date columns need to be prefixed with `date_`, e.g. `date_4_5_4_month` for a 4-5-4 retail calendar date set. Dimensions can have any name (see following section). - +To do this, set the value of the `dbt_metrics_calendar_model` variable in your `dbt_project.yml` file: +```yaml +#dbt_project.yml +config-version: 2 +[...] +vars: + dbt_metrics_calendar_model: my_custom_calendar +``` + +#### Dimensions from calendar tables +You may want to aggregate metrics by a dimension in your custom calendar table, for example is_weekend. You can include this within the list of dimensions in the macro call without it needing to be defined in the metric definition. 
+ +To do so, set a list variable at the project level called custom_calendar_dimension_list, as shown in the example below. + +```yaml +#dbt_project.yml +vars: + custom_calendar_dimension_list: ["is_weekend"] +``` + + + +### Configuration + +Metric nodes now accept `config` dictionaries like other dbt resources. Specify Metric configs in the metric yml itself, or for groups of metrics in the `dbt_project.yml` file. + + + + + + + ```yml -packages: - - package: dbt-labs/metrics - version: [">=1.3.0", "<1.4.0"] +version: 2 +metrics: + - name: config_metric + label: Example Metric with Config + model: ref('my_model') + calculation_method: count + timestamp: date_field + time_grains: [day, week, month] + config: + enabled: true +``` + + + + + + + + +```yml +metrics: + your_project_name: + +enabled: true ``` + + + + + + + +#### Accepted Metric Configurations + +The following is the list of currently accepted metric configs: + +| Config | Type | Accepted Values | Default Value | Description | +|--------|------|-----------------|---------------|-------------| +| `enabled` | boolean | True/False | True | Enables or disables a metric node. When disabled, dbt will not consider it as part of your project. | +| `treat_null_values_as_zero` | boolean | True/False | True | Controls the `coalesce` behavior for metrics. By default, when there are no observations for a metric, the output of the metric as well as [Period over Period](#secondary-calculations) secondary calculations will include a `coalesce({{ field }}, 0)` to return 0's rather than nulls. Setting this config to False instead returns `NULL` values. | + - +## Querying Your Metric + +:::caution dbt_metrics is no longer supported +The dbt_metrics package has been deprecated and replaced with [MetricFlow](/docs/build/about-metricflow?version=1.6), a new framework for defining metrics in dbt. This means dbt_metrics is no longer supported after dbt v1.5 and won't receive any code fixes. 
+::: + +You can dynamically query metrics directly in dbt and verify them before running a job in the deployment environment. To query your defined metric, you must have the [dbt_metrics package](https://github.com/dbt-labs/dbt_metrics) installed. Information on how to [install packages can be found here](https://docs.getdbt.com/docs/build/packages#how-do-i-add-a-package-to-my-project). + +Use the following [metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/) installation code in your packages.yml file and run `dbt deps` to install the metrics package: + + ```yml packages: - package: dbt-labs/metrics - version: [">=0.3.0", "<0.4.0"] + version: [">=1.3.0", "<1.4.0"] ``` - + ```yml packages: - package: dbt-labs/metrics - version: [">=0.2.0", "<0.3.0"] + version: [">=0.3.0", "<0.4.0"] ``` -Once the package has been installed with `dbt deps`, make sure to run the `dbt_metrics_calendar_model` model as this is required for macros used to query metrics. More information on this, and additional calendar functionality, can be found in the [project README](https://github.com/dbt-labs/dbt_metrics#calendar). +Once the package has been installed with `dbt deps`, make sure to run the `dbt_metrics_default_calendar` model as this is required for macros used to query metrics. More information on this, and additional calendar functionality, can be found in the [project README](https://github.com/dbt-labs/dbt_metrics#calendar). ### Querying metrics with `metrics.calculate` Use the `metrics.calculate` macro along with defined metrics to generate a SQL statement that runs the metric aggregation to return the correct metric dataset. Example below: @@ -377,19 +480,6 @@ from {{ metrics.calculate( - - -```sql -select * -from {{ metrics.calculate( - metric_name='new_customers', - grain='week', - dimensions=['plan', 'country'] -) }} -``` - - - ### Supported inputs The example above doesn't display all the potential inputs you can provide to the macro. 
@@ -398,19 +488,82 @@ You may find some pieces of functionality, like secondary calculations, complica | Input | Example | Description | Required | | ----------- | ----------- | ----------- | -----------| -| metric_listmetric_name | `metric('some_metric)'`,
[`metric('some_metric)'`,
`metric('some_other_metric)'`]
`'metric_name'`
| The metric(s) to be queried by the macro. If multiple metrics required, provide in list format.The name of the metric | Required | -| grain | `'day'`, `'week'`,
`'month'`, `'quarter'`,
`'year'`, `'all_time'`
| The time grain that the metric will be aggregated to in the returned dataset | Required | +| metric_list | `metric('some_metric')`,
[`metric('some_metric')`,
`metric('some_other_metric')`]
| The metric(s) to be queried by the macro. If multiple metrics are required, provide them in list format. | Required | +| grain | `'day'`, `'week'`,
`'month'`, `'quarter'`,
`'year'`
| The time grain that the metric will be aggregated to in the returned dataset | Optional | | dimensions | [`'plan'`,
`'country'`] | The dimensions you want the metric to be aggregated by in the returned dataset | Optional | -| secondary_calculations | [`metrics.period_over_period( comparison_strategy="ratio", interval=1, alias="pop_1wk")`] | Performs the specified secondary calculation on the metric results. Examples include period over period calculations, rolling calcultions, and period to date calculations. | Optional | +| secondary_calculations | [`metrics.period_over_period( comparison_strategy="ratio", interval=1, alias="pop_1wk")`] | Performs the specified secondary calculation on the metric results. Examples include period over period calculations, rolling calculations, and period to date calculations. | Optional | | start_date | `'2022-01-01'` | Limits the date range of data used in the metric calculation by not querying data before this date | Optional | -| end_date | `'2022-12-31'` | Limits the date range of data used in the metric claculation by not querying data after this date | Optional | -| where | `plan='paying_customer'` | A sql statment, or series of sql statements, that alter the **final** CTE in the generated sql. Most often used to limit the data to specific values of dimensions provided | Optional | +| end_date | `'2022-12-31'` | Limits the date range of data used in the metric calculation by not querying data after this date | Optional | +| where | `plan='paying_customer'` | A sql statement, or series of sql statements, that alter the **final** CTE in the generated sql. Most often used to limit the data to specific values of dimensions provided | Optional | -#### Secondary Calculations +### Secondary Calculations Secondary calculations are window functions you can add to the metric calculation and perform on the primary metric or metrics. You can use them to compare values to an earlier period, calculate year-to-date sums, and return rolling averages. 
You can add custom secondary calculations into dbt projects - for more information on this, reference the [package README](https://github.com/dbt-labs/dbt_metrics#secondary-calculations). +The supported Secondary Calculations are: + +#### Period over Period: + +The period over period secondary calculation performs a calculation against the metric(s) in question by either determining the difference or the ratio between two points in time. The input variable, which looks at the grain selected in the macro, determines the other point. + +| Input | Example | Description | Required | +| -------------------------- | ----------- | ----------- | -----------| +| `comparison_strategy` | `ratio` or `difference` | How to calculate the delta between the two periods | Yes | +| `interval` | 1 | Integer - the number of time grains to look back | Yes | +| `alias` | `week_over_week` | The column alias for the resulting calculation | No | +| `metric_list` | `base_sum_metric` | List of metrics that the secondary calculation should be applied to. Default is all metrics selected | No | + +#### Period to Date: + +The period to date secondary calculation performs an aggregation on a defined period of time that is equal to or higher than the grain selected. For example, when you want to display a month_to_date value alongside your weekly grained metric. + +| Input | Example | Description | Required | +| -------------------------- | ----------- | ----------- | -----------| +| `aggregate` | `max`, `average` | The aggregation to use in the window function. Options vary based on the primary aggregation and are enforced in [validate_aggregate_coherence()](https://github.com/dbt-labs/dbt_metrics/blob/main/macros/validation/validate_aggregate_coherence.sql). | Yes | +| `period` | `"day"`, `"week"` | The time grain to aggregate to. One of [`"day"`, `"week"`, `"month"`, `"quarter"`, `"year"`]. 
Must be at an equal or coarser (higher, more aggregated) granularity than the metric's grain (see [Time Grains](#time-grains) below). For example, with a grain of `month`, the acceptable periods would be `month`, `quarter`, or `year`. | Yes | +| `alias` | `month_to_date` | The column alias for the resulting calculation | No | +| `metric_list` | `base_sum_metric` | List of metrics that the secondary calculation should be applied to. Default is all metrics selected | No | + +#### Rolling: + + + +The rolling secondary calculation performs an aggregation on a number of rows in the metric dataset. For example, if the user selects the `week` grain and sets a rolling secondary calculation to `4` then the value returned will be a rolling 4 week calculation of whatever aggregation type was selected. If the `interval` input is not provided then the rolling calculation will be unbounded on all preceding rows. + +| Input | Example | Description | Required | +| -------------------------- | ----------- | ----------- | -----------| +| `aggregate` | `max`, `average` | The aggregation to use in the window function. Options vary based on the primary aggregation and are enforced in [validate_aggregate_coherence()](https://github.com/dbt-labs/dbt_metrics/blob/main/macros/validation/validate_aggregate_coherence.sql). | Yes | +| `interval` | 1 | Integer - the number of time grains to look back | No | +| `alias` | `month_to_date` | The column alias for the resulting calculation | No | +| `metric_list` | `base_sum_metric` | List of metrics that the secondary calculation should be applied to. Default is all metrics selected | No | + + + + +The rolling secondary calculation performs an aggregation on a number of rows in the metric dataset. For example, if the user selects the `week` grain and sets a rolling secondary calculation to `4`, then the value returned will be a rolling 4-week calculation of whatever aggregation type was selected. 
+ +| Input | Example | Description | Required | +| -------------------------- | ----------- | ----------- | -----------| +| `aggregate` | `max`, `average` | The aggregation to use in the window function. Options vary based on the primary aggregation and are enforced in [validate_aggregate_coherence()](https://github.com/dbt-labs/dbt_metrics/blob/main/macros/validation/validate_aggregate_coherence.sql). | Yes | +| `interval` | 1 | Integer - the number of time grains to look back | Yes | +| `alias` | `month_to_date` | The column alias for the resulting calculation | No | +| `metric_list` | `base_sum_metric` | List of metrics that the secondary calculation should be applied to. Default is all metrics selected | No | + + + + +#### Prior: +The prior secondary calculation returns the value from a specified number of intervals before the row. + +| Input | Example | Description | Required | +| -------------------------- | ----------- | ----------- | -----------| +| `interval` | 1 | Integer - the number of time grains to look back | Yes | +| `alias` | `2_weeks_prior` | The column alias for the resulting calculation | No | +| `metric_list` | `base_sum_metric` | List of metrics that the secondary calculation should be applied to. Default is all metrics selected | No | + + + ### Developing metrics with `metrics.develop` @@ -487,12 +640,64 @@ from {{ metrics.develop( - +#### Multiple/Derived Metrics with `metrics.develop` +If you have a more complicated use case that you are interested in testing, the develop macro also supports this behavior. The only caveat is that you must include the raw tags for any provided metric yml that contains a derived metric. Example below: -Functionality for `develop` is only supported in v1.2 and higher. Please navigate to those versions for information about this method of metric development. 
+``` +{% set my_metric_yml -%} +{% raw %} + +metrics: + - name: develop_metric + model: ref('fact_orders') + label: Total Discount ($) + timestamp: order_date + time_grains: [day, week, month] + calculation_method: average + expression: discount_total + dimensions: + - had_discount + - order_country + + - name: derived_metric + label: Total Discount ($) + timestamp: order_date + time_grains: [day, week, month] + calculation_method: derived + expression: "{{ metric('develop_metric') }} - 1 " + dimensions: + - had_discount + - order_country + + - name: some_other_metric_not_using + label: Total Discount ($) + timestamp: order_date + time_grains: [day, week, month] + calculation_method: derived + expression: "{{ metric('derived_metric') }} - 1 " + dimensions: + - had_discount + - order_country + +{% endraw %} +{%- endset %} + +select * +from {{ metrics.develop( + develop_yml=my_metric_yml, + metric_list=['derived_metric'], + grain='month' + ) + }} +``` + +The above example will return a dataset that contains the metric provided in the metric list (`derived_metric`) and the parent metric (`develop_metric`). It will not contain `some_other_metric_not_using` as it is not designated in the metric list or a parent of the metrics included. + +**Important caveat** - You _must_ wrap the `expression` property for `derived` metrics in double quotes to render it. For example, `expression: "{{ metric('develop_metric') }} - 1 "`. + - + diff --git a/website/docs/docs/build/models.md b/website/docs/docs/build/models.md index 029e2856abb..d10eb5ed01a 100644 --- a/website/docs/docs/build/models.md +++ b/website/docs/docs/build/models.md @@ -1,6 +1,9 @@ --- title: "About dbt models" +description: "Read this tutorial to learn how to use models when building in dbt." id: "models" +pagination_next: "docs/build/sql-models" +pagination_prev: null --- ## Overview @@ -17,4 +20,4 @@ The top level of a dbt workflow is the project. A project is a directory of a `. 
Your organization may need only a few models, but more likely you’ll need a complex structure of nested models to transform the required data. A model is a single file containing a final `select` statement, and a project can have multiple models, and models can even reference each other. Add to that, numerous projects and the level of effort required for transforming complex data sets can improve drastically compared to older methods. -Learn more about models in [SQL models](/docs/build/sql-models) and [Python models](/docs/build/python-models) pages. If you'd like to begin with a bit of practice, visit our [Getting Started Guide](/docs/get-started/getting-started/overview) for instructions on setting up the Jaffle_Shop sample data so you can get hands-on with the power of dbt. +Learn more about models in [SQL models](/docs/build/sql-models) and [Python models](/docs/build/python-models) pages. If you'd like to begin with a bit of practice, visit our [Getting Started Guide](/quickstarts) for instructions on setting up the Jaffle_Shop sample data so you can get hands-on with the power of dbt. diff --git a/website/docs/docs/build/organize-your-outputs.md b/website/docs/docs/build/organize-your-outputs.md new file mode 100644 index 00000000000..ad5efeda1c7 --- /dev/null +++ b/website/docs/docs/build/organize-your-outputs.md @@ -0,0 +1,38 @@ +--- +title: "Organize your outputs" +description: "Learn how you can organize your outputs" +pagination_next: "docs/build/custom-schemas" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + + +
\ No newline at end of file diff --git a/website/docs/docs/build/packages.md b/website/docs/docs/build/packages.md index 3a77ce310b4..8d18a55e949 100644 --- a/website/docs/docs/build/packages.md +++ b/website/docs/docs/build/packages.md @@ -3,7 +3,7 @@ title: "Packages" id: "packages" --- -## What is a package? + Software engineers frequently modularize code into libraries. These libraries help programmers operate with leverage: they can spend more time focusing on their unique business logic, and less time implementing code that someone else has already spent the time perfecting. In dbt, libraries like these are called _packages_. dbt's packages are so powerful because so many of the analytic problems we encountered are shared across organizations, for example: @@ -22,12 +22,24 @@ dbt _packages_ are in fact standalone dbt projects, with models and macros that * Models in the package will be materialized when you `dbt run`. * You can use `ref` in your own models to refer to models from the package. * You can use macros in the package in your own project. +* It's important to note that defining and installing dbt packages is different from [defining and installing Python packages](/docs/build/python-models#using-pypi-packages) + + +:::info `dependencies.yml` has replaced `packages.yml` +Starting from dbt v1.6, `dependencies.yml` has replaced `packages.yml`. This file can now contain both types of dependencies: "package" and "project" dependencies. +- "Package" dependencies lets you add source code from someone else's dbt project into your own, like a library. +- "Project" dependencies provide a different way to build on top of someone else's work in dbt. Refer to [Project dependencies](/docs/collaborate/govern/project-dependencies) for more info. +- +You can rename `packages.yml` to `dependencies.yml`, _unless_ you need to use Jinja within your packages specification. 
This could be necessary, for example, if you want to add an environment variable with a git token in a private git package specification. + +::: + ## How do I add a package to my project? -1. Add a `packages.yml` file to your dbt project. This should be at the same level as your `dbt_project.yml` file. +1. Add a file named `dependencies.yml` or `packages.yml` to your dbt project. This should be at the same level as your `dbt_project.yml` file. 2. Specify the package(s) you wish to add using one of the supported syntaxes, for example: - + ```yaml packages: @@ -42,11 +54,7 @@ packages: - - -- **v1.0.0:** The default [`packages-install-path`](packages-install-path) has been updated to be `dbt_packages` instead of `dbt_modules`. - - +The default [`packages-install-path`](/reference/project-configs/packages-install-path) is `dbt_packages`. 3. Run `dbt deps` to install the package(s). Packages get installed in the `dbt_packages` directory – by default this directory is ignored by git, to avoid duplicating the source code for the package. @@ -83,13 +91,6 @@ In comparison, other package installation methods are unable to handle the dupli #### Prerelease versions - - -* `v0.20.1`: Fixed handling for prerelease versions. Introduced `install-prerelease` parameter. -* `v1.0.0`: When you provide an explicit prerelease version, dbt will install that version. - - - Some package maintainers may wish to push prerelease versions of packages to the dbt Hub, in order to test out new functionality or compatibility with a new version of dbt. A prerelease version is demarcated by a suffix, such as `a1` (first alpha), `b2` (second beta), or `rc3` (third release candidate). By default, `dbt deps` will not include prerelease versions when resolving package dependencies. 
You can enable the installation of prereleases in one of two ways: @@ -124,17 +125,19 @@ packages: - - -* `v0.20.0`: Introduced the ability to specify commit hashes as package revisions - - - Add the Git URL for the package, and optionally specify a revision. The revision can be: - a branch name - a tagged release - a specific commit (full 40-character hash) +Example of a revision specifying a 40-character hash: + +```yaml +packages: + - git: "https://github.com/dbt-labs/dbt-utils.git" + revision: 4e28d6da126e2940d17f697de783a717f2503188 +``` + We **strongly recommend** "pinning" your package to a specific release by specifying a release name. If you do not provide a revision, or if you use `master`, then any updates to the package will be incorporated into your project the next time you run `dbt deps`. While we generally try to avoid making breaking changes to these packages, they are sometimes unavoidable. Pinning a package revision helps prevent your code from changing without your explicit approval. @@ -143,6 +146,23 @@ To find the latest release for a package, navigate to the `Releases` tab in the As of v0.14.0, dbt will warn you if you install a package using the `git` syntax without specifying a version (see below). + + +### Internally hosted tarball URL + +Some organizations have security requirements to pull resources only from internal services. To address the need to install packages from hosted environments such as Artifactory or cloud storage buckets, dbt Core enables you to install packages from internally-hosted tarball URLs. + + +```yaml +packages: + - tarball: https://codeload.github.com/dbt-labs/dbt-utils/tar.gz/0.9.6 + name: 'dbt_utils' +``` + +Where `name: 'dbt_utils'` specifies the subfolder of `dbt_packages` that's created for the package source code to be installed within. + + + ### Private packages #### SSH Key Method (Command Line only) @@ -160,7 +180,7 @@ packages:
-If you're using dbt Cloud, the SSH key method will not work, but you can use the [HTTPS Git Token Method](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#git-token-method). +If you're using dbt Cloud, the SSH key method will not work, but you can use the [HTTPS Git Token Method](https://docs.getdbt.com/docs/build/packages#git-token-method). #### Git Token Method @@ -177,7 +197,14 @@ In GitHub: ```yaml packages: + # use this format when accessing your repository via a github application token + - git: "https://{{env_var('DBT_ENV_SECRET_GIT_CREDENTIAL')}}@github.com/dbt-labs/awesome_repo.git" # git HTTPS URL + + # use this format when accessing your repository via a classical personal access token - git: "https://{{env_var('DBT_ENV_SECRET_GIT_CREDENTIAL')}}@github.com/dbt-labs/awesome_repo.git" # git HTTPS URL + + # use this format when accessing your repository via a fine-grained personal access token (username sometimes required) + - git: "https://GITHUB_USERNAME:{{env_var('DBT_ENV_SECRET_GIT_CREDENTIAL')}}@github.com/dbt-labs/awesome_repo.git" # git HTTPS URL ``` @@ -225,15 +252,9 @@ Read more about creating a Personal Access Token [here](https://confluence.atlas -#### Project subdirectories - - - -* `v0.20.0`: Introduced the ability to specify `subdirectory` - - +#### Configure subdirectory for packaged projects -In general, dbt expects `dbt_project.yml` to be located as a top-level file in a package. If the project is instead nested in a subdirectory—perhaps within a much larger monorepo—you can optionally specify the folder path as `subdirectory`. dbt will attempt a [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) of just the files located within that subdirectory. Note that you must be using a recent version of `git` (`>=2.25.0`). +In general, dbt expects `dbt_project.yml` to be located as a top-level file in a package. 
If the packaged project is instead nested in a subdirectory—perhaps within a much larger mono repo—you can optionally specify the folder path as `subdirectory`. dbt will attempt a [sparse checkout](https://git-scm.com/docs/git-sparse-checkout) of just the files located within that subdirectory. Note that you must be using a recent version of `git` (`>=2.26.0`). @@ -246,25 +267,42 @@ packages: ### Local packages -Packages that you have stored locally can be installed by specifying the path to the project, like so: +A "local" package is a dbt project accessible from your local file system. You can install it by specifying the project's path. It works best when you nest the project within a subdirectory relative to your current project's directory. ```yaml packages: - - local: /opt/dbt/redshift # use a local path + - local: relative/path/to/subdirectory ``` -Local packages should only be used for specific situations, for example, when testing local changes to a package. +Other patterns may work in some cases, but not always. For example, if you install this project as a package elsewhere, or try running it on a different system, the relative and absolute paths will yield the same results. + + + +```yaml +packages: + # not recommended - support for these patterns vary + - local: /../../redshift # relative path to a parent directory + - local: /opt/dbt/redshift # absolute path on the system +``` + + + +There are a few specific use cases where we recommend using a "local" package: +1. **Monorepo** — When you have multiple projects, each nested in a subdirectory, within a monorepo. "Local" packages allow you to combine projects for coordinated development and deployment. +2. **Testing changes** — To test changes in one project or package within the context of a downstream project or package that uses it. By temporarily switching the installation to a "local" package, you can make changes to the former and immediately test them in the latter for quicker iteration. 
This is similar to [editable installs](https://pip.pypa.io/en/stable/topics/local-project-installs/) in Python. +3. **Nested project** — When you have a nested project that defines fixtures and tests for a project of utility macros, like [the integration tests within the `dbt-utils` package](https://github.com/dbt-labs/dbt-utils/tree/main/integration_tests). + ## What packages are available? Check out [dbt Hub](https://hub.getdbt.com) to see the library of published dbt packages! ## Advanced package configuration ### Updating a package -When you update a version or revision in your `packages.yml` file, it isn't automatically updated in your dbt project. You should run `dbt deps` to update the package. You may also need to run a [full refresh](run) of the models in this package. +When you update a version or revision in your `packages.yml` file, it isn't automatically updated in your dbt project. You should run `dbt deps` to update the package. You may also need to run a [full refresh](/reference/commands/run) of the models in this package. ### Uninstalling a package When you remove a package from your `packages.yml` file, it isn't automatically deleted from your dbt project, as it still exists in your `dbt_packages/` directory. If you want to completely uninstall a package, you should either: @@ -334,3 +372,4 @@ packages: ``` + diff --git a/website/docs/docs/build/project-variables.md b/website/docs/docs/build/project-variables.md index 04d713756d7..1fe45b18ac0 100644 --- a/website/docs/docs/build/project-variables.md +++ b/website/docs/docs/build/project-variables.md @@ -1,9 +1,10 @@ --- title: "Project variables" id: "project-variables" +pagination_next: "docs/build/environment-variables" --- -dbt provides a mechanism, [variables](reference/dbt-jinja-functions/var), to provide data to models for +dbt provides a mechanism, [variables](/reference/dbt-jinja-functions/var), to provide data to models for compilation. 
Variables can be used to [configure timezones](https://github.com/dbt-labs/snowplow/blob/0.3.9/dbt_project.yml#L22), [avoid hardcoding table names](https://github.com/dbt-labs/quickbooks/blob/v0.1.0/dbt_project.yml#L23) or otherwise provide data to models to configure how they are compiled. @@ -17,6 +18,13 @@ Variables can be defined in two ways: ### Defining variables in `dbt_project.yml` + +:::info + +Jinja is not supported within the `vars` config, and all values will be interpreted literally. + +::: + :::info New in v0.17.0 The syntax for specifying vars in the `dbt_project.yml` file has changed in @@ -86,18 +94,32 @@ You can find more information on defining dictionaries with YAML [here](https:// ### Variable precedence -Variables defined with the `--vars` command line argument override variables -defined in the `dbt_project.yml` file. They are globally scoped and will be -accessible to all packages included in the project. +Variables defined with the `--vars` command line argument override variables defined in the `dbt_project.yml` file. They are globally scoped and accessible to the root project and all installed packages. The order of precedence for variable declaration is as follows (highest priority first): + + 1. The variables defined on the command line with `--vars`. -3. The package-scoped variable declaration in the `dbt_project.yml` file -2. The global variable declaration in the `dbt_project.yml` file. +2. The package-scoped variable declaration in the root `dbt_project.yml` file +3. The global variable declaration in the root `dbt_project.yml` file +4. If this node is defined in a package: variable declarations in that package's `dbt_project.yml` file +5. The variable's default argument (if one is provided) + + + + + +1. The variables defined on the command line with `--vars` +2. The package-scoped variable declaration in the root `dbt_project.yml` file +3. The global variable declaration in the root `dbt_project.yml` file 4. 
The variable's default argument (if one is provided). + + If dbt is unable to find a definition for a variable after checking these four places, then a compilation error will be raised. - +**Note:** Variable scope is based on the node ultimately using that variable. Imagine the case where a model defined in the root project is calling a macro defined in an installed package. That macro, in turn, uses the value of a variable. The variable will be resolved based on the _root project's_ scope, rather than the package's scope. + + diff --git a/website/docs/docs/build/projects.md b/website/docs/docs/build/projects.md index 8c6f11e46c2..b4b04e3334d 100644 --- a/website/docs/docs/build/projects.md +++ b/website/docs/docs/build/projects.md @@ -1,9 +1,11 @@ --- title: "About dbt projects" id: "projects" +pagination_next: null +pagination_prev: null --- -A dbt project informs dbt the context of your project and how to transform your data (build your data sets). By design, dbt enforces the top-level structure of a dbt project such as the `dbt_project.yml` file, the `models` directory, the `snapshots` directory, and so on. Within the directories of the top-level, you can organize your project in any way that meets the needs of your organization and data pipeline. +A dbt project informs dbt about the context of your project and how to transform your data (build your data sets). By design, dbt enforces the top-level structure of a dbt project such as the `dbt_project.yml` file, the `models` directory, the `snapshots` directory, and so on. Within the directories of the top-level, you can organize your project in any way that meets the needs of your organization and data pipeline. At a minimum, all a project needs is the `dbt_project.yml` project configuration file. 
dbt supports a number of different resources, so a project may also include: @@ -18,9 +20,10 @@ At a minimum, all a project needs is the `dbt_project.yml` project configuration | [sources](/docs/build/sources) | A way to name and describe the data loaded into your warehouse by your Extract and Load tools. | | [exposures](/docs/build/exposures) | A way to define and describe a downstream use of your project. | | [metrics](/docs/build/metrics) | A way for you to define metrics for your project. | +| [groups](/docs/build/groups) | Groups enable collaborative node organization in restricted collections. | | [analysis](/docs/build/analyses) | A way to organize analytical SQL queries in your project such as the general ledger from your QuickBooks. | -When building out the structure of your project, you should consider these impacts to your organization's workflow: +When building out the structure of your project, you should consider these impacts on your organization's workflow: * **How would people run dbt commands** — Selecting a path * **How would people navigate within the project** — Whether as developers in the IDE or stakeholders from the docs @@ -52,11 +55,31 @@ Edit `dbt_project.yml` to set up common project configurations such as: For complete details on project configurations, see [dbt_project.yml](/reference/dbt_project.yml). +## Project subdirectories + +You can use the Project subdirectory option in dbt Cloud to specify a subdirectory in your git repository that dbt should use as the root directory for your project. This is helpful when you have multiple dbt projects in one repository or when you want to organize your dbt project files into subdirectories for easier management. + +To use the Project subdirectory option in dbt Cloud, follow these steps: + +1. Click on the cog icon on the upper right side of the page and click on **Account Settings**. + +2. Under **Projects**, select the project you want to configure as a project subdirectory. + +3. 
Select **Edit** in the lower right-hand corner of the page. + +4. In the **Project subdirectory** field, add the name of the subdirectory. For example, if your dbt project files are located in a subdirectory called `/finance`, you would enter `finance` as the subdirectory. + + * You can also reference nested subdirectories. For example, if your dbt project files are located in `/teams/finance`, you would enter `teams/finance` as the subdirectory. **Note**: You do not need a leading or trailing `/` in the Project subdirectory field. + +5. Click **Save** when you've finished. + +After configuring the Project subdirectory option, dbt Cloud will use it as the root directory for your dbt project. This means that dbt commands, such as `dbt run` or `dbt test`, will operate on files within the specified subdirectory. If there is no `dbt_project.yml` file in the Project subdirectory, you will be prompted to initialize the dbt project. + ## New projects You can create new projects and [share them](/docs/collaborate/git-version-control) with other people by making them available on a hosted git repository like GitHub, GitLab, and BitBucket. -After you set up a connection with your data platform, you can [initialize your new project in dbt Cloud](/docs/get-started/getting-started/set-up-dbt-cloud) and start developing. Or, run [dbt init from the command line](/reference/commands/init) to set up your new project. +After you set up a connection with your data platform, you can [initialize your new project in dbt Cloud](/quickstarts) and start developing. Or, run [dbt init from the command line](/reference/commands/init) to set up your new project. During project initialization, dbt creates sample model files in your project directory to help you start developing quickly. 
@@ -66,7 +89,8 @@ If you want to explore dbt projects more in-depth, you can clone dbt Lab’s [Ja If you want to see what a mature, production project looks like, check out the [GitLab Data Team public repo](https://gitlab.com/gitlab-data/analytics/-/tree/master/transform/snowflake-dbt). + ## Related docs -- [Best practices: How we structure our dbt projects](/guides/best-practices/how-we-structure/1-guide-overview) -* [Get started with dbt Cloud](/docs/get-started/getting-started/set-up-dbt-cloud) -* [Get started with dbt Core](/docs/get-started/getting-started-dbt-core) +* [Best practices: How we structure our dbt projects](/guides/best-practices/how-we-structure/1-guide-overview) +* [Quickstarts for dbt Cloud](/quickstarts) +* [Quickstart for dbt Core](/quickstarts/manual-install) diff --git a/website/docs/docs/build/python-models.md b/website/docs/docs/build/python-models.md index 3f213f1fdca..bff65362d06 100644 --- a/website/docs/docs/build/python-models.md +++ b/website/docs/docs/build/python-models.md @@ -2,31 +2,21 @@ title: "Python models" id: "python-models" --- -- [Overview](#overview) -- [Configuring Python Models](#configuring-python-models) -- [Python Specific Functionality](#configuring-python-models) -- [Limitations](#limitations) -- [Supported data platforms](#supported-data-platforms) dbt Core v1.3 adds support for Python models. Note that only [specific data platforms](#specific-data-platforms) support dbt-py models. We encourage you to: - Read [the original discussion](https://github.com/dbt-labs/dbt-core/discussions/5261) that proposed this feature. -- Contribute to [best practices for developing Python models in dbt](https://github.com/dbt-labs/docs.getdbt.com/discussions/1811). -- Weigh in on [next steps for Python models, beyond v1.3](https://github.com/dbt-labs/dbt-core/discussions/5742). -- Join the **#beta-feedback-python-models** channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/). 
+- Contribute to [best practices for developing Python models in dbt](https://discourse.getdbt.com/t/dbt-python-model-dbt-py-best-practices/5204). +- Share your thoughts and ideas on [next steps for Python models](https://github.com/dbt-labs/dbt-core/discussions/5742). +- Join the **#dbt-core-python-models** channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/). - -In the following article, you'll see a section titled "❓ **dbt questions**." We are excited to release the first set of functionality in v1.3, which will solve real use cases. We also know this is the first step toward a much wider field of possibility. We don't pretend to have all the answers. - -We're excited to keep developing our opinionated recommendations and next steps for product development, and we want your help. Comment in the GitHub discussions; leave thoughts in Slack; bring up dbt + Python in casual conversation with colleagues and friends. - ## Overview -dbt Python ("dbt-py") models will help you solve use cases that can't be solved with SQL. You can perform analyses using tools available in the open-source Python ecosystem, including state-of-the-art packages for data science and statistics. Before, you would have needed separate infrastructure and orchestration to run Python transformations in production. Python transformations defined in dbt are models in your project with all the same capabilities around testing, documentation, and lineage. +dbt Python (`dbt-py`) models can help you solve use cases that can't be solved with SQL. You can perform analyses using tools available in the open-source Python ecosystem, including state-of-the-art packages for data science and statistics. Before, you would have needed separate infrastructure and orchestration to run Python transformations in production. Python transformations defined in dbt are models in your project with all the same capabilities around testing, documentation, and lineage. 
- + Python models are supported in dbt Core 1.3 and higher. Learn more about [upgrading your version in dbt Cloud](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions) and [upgrading dbt Core versions](https://docs.getdbt.com/docs/core-versions#upgrading-to-new-patch-versions). @@ -36,7 +26,6 @@ To read more about Python models, change the [docs version to 1.3](/docs/build/p - ```python @@ -78,7 +67,7 @@ models: - not_null tests: # Write your own validation logic (in SQL) for Python results - - [custom_generic_test](writing-custom-generic-tests) + - [custom_generic_test](/guides/best-practices/writing-custom-generic-tests) ``` @@ -88,7 +77,7 @@ models: The prerequisites for dbt Python models include using an adapter for a data platform that supports a fully featured Python runtime. In a dbt Python model, all Python code is executed remotely on the platform. None of it is run by dbt locally. We believe in clearly separating _model definition_ from _model execution_. In this and many other ways, you'll find that dbt's approach to Python models mirrors its longstanding approach to modeling data in SQL. -We've written this guide assuming that you have some familiarity with dbt. If you've never before written a dbt model, we encourage you to start by first reading [dbt Models](building-models). Throughout, we'll be drawing connections between Python models and SQL models, as well as making clear their differences. +We've written this guide assuming that you have some familiarity with dbt. If you've never before written a dbt model, we encourage you to start by first reading [dbt Models](/docs/build/models). Throughout, we'll be drawing connections between Python models and SQL models, as well as making clear their differences. ### What is a Python model? @@ -98,7 +87,7 @@ This is similar to the role of CTEs in dbt SQL models. We Instead of a final `select` statement, each Python model returns a final DataFrame. 
Each DataFrame operation is "lazily evaluated." In development, you can preview its data, using methods like `.show()` or `.head()`. When you run a Python model, the full result of the final DataFrame will be saved as a table in your data warehouse. -dbt Python models have access to almost all of the same configuration options as SQL models. You can test them, document them, add `tags` and `meta` properties to them, grant access to their results to other users, and so on. You can select them by their name, their file path, configurations, whether they are upstream or downstream of another model, or if they have been modified compared to a previous project state. +dbt Python models have access to almost all of the same configuration options as SQL models. You can test and document them, add `tags` and `meta` properties, and grant access to their results to other users. You can select them by their name, file path, configurations, whether they are upstream or downstream of another model, or if they have been modified compared to a previous project state. ### Defining a Python model @@ -159,6 +148,11 @@ with upstream_python_model as ( +:::caution + +Referencing [ephemeral](/docs/build/materializations#ephemeral) models is currently not supported (see [feature request](https://github.com/dbt-labs/dbt-core/issues/7288)) +::: + ## Configuring Python models Just like SQL models, there are three ways to configure Python models: @@ -179,7 +173,7 @@ def model(dbt, session): -There's a limit to how complex you can get with the `dbt.config()` method. It accepts _only_ literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the [`config` property](resource-properties/config) in a YAML file. 
+There's a limit to how complex you can get with the `dbt.config()` method. It accepts _only_ literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the [`config` property](/reference/resource-properties/config) in a YAML file. #### Accessing project context @@ -229,7 +223,77 @@ def model(dbt, session): ### Materializations -Python models support dbt Materializations. To learn more about them visit the [Materializations page](/docs/build/materializations.md) +Python models support these materializations: +- `table` (default) +- `incremental` + +Incremental Python models support all the same [incremental strategies](/docs/build/incremental-models#about-incremental_strategy) as their SQL counterparts. The specific strategies supported depend on your adapter. As an example, incremental models are supported on BigQuery with Dataproc for the `merge` incremental strategy; the `insert_overwrite` strategy is not yet supported. + +Python models can't be materialized as `view` or `ephemeral`. Python isn't supported for non-model resource types (like tests and snapshots). + +For incremental models, like SQL models, you need to filter incoming tables to only new rows of data: + + + +
+ + + +```python +import snowflake.snowpark.functions as F + +def model(dbt, session): + dbt.config(materialized = "incremental") + df = dbt.ref("upstream_table") + + if dbt.is_incremental: + + # only new rows compared to max in current table + max_from_this = f"select max(updated_at) from {dbt.this}" + df = df.filter(df.updated_at >= session.sql(max_from_this).collect()[0][0]) + + # or only rows from the past 3 days + df = df.filter(df.updated_at >= F.dateadd("day", F.lit(-3), F.current_timestamp())) + + ... + + return df +``` + + + +
+ +
+ + + +```python +import pyspark.sql.functions as F + +def model(dbt, session): + dbt.config(materialized = "incremental") + df = dbt.ref("upstream_table") + + if dbt.is_incremental: + + # only new rows compared to max in current table + max_from_this = f"select max(updated_at) from {dbt.this}" + df = df.filter(df.updated_at >= session.sql(max_from_this).collect()[0][0]) + + # or only rows from the past 3 days + df = df.filter(df.updated_at >= F.date_add(F.current_timestamp(), F.lit(-3))) + + ... + + return df +``` + + + +
+ +
## Python-specific functionality @@ -254,11 +318,11 @@ def model(dbt, session): -Currently, Python functions defined in one dbt model can't be imported and reused in other models. See the ["Code reuse"](#code-reuse) section for the potential patterns we're considering. +Currently, Python functions defined in one dbt model can't be imported and reused in other models. Refer to [Code reuse](#code-reuse) for the potential patterns being considered. ### Using PyPI packages -You can also define functions that depend on third-party packages so long as those packages are installed and available to the Python runtime on your data platform. See notes on "Installing Packages" for [specific data warehouses](#specific-data-warehouses). +You can also define functions that depend on third-party packages so long as those packages are installed and available to the Python runtime on your data platform. See notes on "Installing Packages" for [specific data platforms](#specific-data-platforms). In this example, we use the `holidays` package to determine if a given date is a holiday in France. The code below uses the pandas API for simplicity and consistency across platforms. The exact syntax, and the need to refactor for multi-node processing, still vary. @@ -290,6 +354,7 @@ def model(dbt, session): # apply our function # (columns need to be in uppercase on Snowpark) df["IS_HOLIDAY"] = df["ORDER_DATE"].apply(is_holiday) + df["ORDER_DATE"].dt.tz_localize('UTC') # convert from Number/Long to tz-aware Datetime # return final dataset (Pandas DataFrame) return df @@ -373,8 +438,8 @@ models: #### User-defined functions (UDFs) You can use the `@udf` decorator or `udf` function to define an "anonymous" function and call it within your `model` function's DataFrame transformation. This is a typical pattern for applying more complex functions as DataFrame operations, especially if those functions require inputs from third-party packages. 
-- [Snowpark Python: Creating s](https://docs.snowflake.com/en/developer-guide/snowpark/python/creating-udfs.html) -- ["PySpark functions: udf"](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.udf.html) +- [Snowpark Python: Creating UDFs](https://docs.snowflake.com/en/developer-guide/snowpark/python/creating-udfs.html) +- [PySpark functions: udf](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.udf.html) @@ -425,7 +490,7 @@ def model(dbt, session): ```python -from pyspark.sql.types as T +import pyspark.sql.types as T import pyspark.sql.functions as F import numpy @@ -456,9 +521,10 @@ def model(dbt, session): #### Code reuse -Currently, Python functions defined in one dbt model cannot be imported and reused in other models. This is something wdbt would like to support, so there are two patterns we're considering: -1. Creating and registering **"named" UDFs** — This process is different across data platforms and has some performance limitations. (Snowpark does support ["vectorized" UDFs](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html): Pandas-like functions that can be executed in parallel.) -2. **Private Python packages** — In addition to importing reusable functions from public PyPI packages, many data platforms support uploading custom Python assets and registering them as packages. The upload process looks different across platforms, but your code’s actual `import` looks the same. +Currently, Python functions defined in one dbt model can't be imported and reused in other models. This is something dbt Labs would like to support, so there are two patterns we're considering: + +- Creating and registering **"named" UDFs** — This process is different across data platforms and has some performance limitations. 
For example, Snowpark supports [vectorized UDFs](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html) for pandas-like functions that you can execute in parallel. +- **Private Python packages** — In addition to importing reusable functions from public PyPI packages, many data platforms support uploading custom Python assets and registering them as packages. The upload process looks different across platforms, but your code’s actual `import` looks the same. :::note ❓ dbt questions @@ -471,7 +537,7 @@ Currently, Python functions defined in one dbt model cannot be imported and reus ### DataFrame API and syntax -Over the past decade, most people writing data transformations in Python have adopted DataFrame as their common abstraction. dbt follows this convention by returning `ref()` and `source()` as DataFrames, and it expects all Python models to return a DataFrame. +Over the past decade, most people writing [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) in Python have adopted DataFrame as their common abstraction. dbt follows this convention by returning `ref()` and `source()` as DataFrames, and it expects all Python models to return a DataFrame. A DataFrame is a two-dimensional data structure (rows and columns). It supports convenient methods for transforming that data and creating new columns from calculations performed on existing columns. It also offers convenient ways for previewing data while developing locally or in a notebook. @@ -481,7 +547,7 @@ When developing a Python model, you will find yourself asking these questions: **Why pandas?** — It's the most common API for DataFrames. It makes it easy to explore sampled data and develop transformations locally. You can “promote” your code as-is into dbt models and run it in production for small datasets. -**Why _not_ pandas?** — Performance. 
pandas runs "single-node" transformations, which cannot benefit from the parallelism and distributed computing offered by modern data warehouses. This quickly becomes a problem as you operate on larger datasets. Some data platforms support optimizations for code written using pandas' DataFrame API, preventing the need for major refactors. For example, ["pandas on PySpark"](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html) offers support for 95% of pandas functionality, using the same API while still leveraging parallel processing. +**Why _not_ pandas?** — Performance. pandas runs "single-node" transformations, which cannot benefit from the parallelism and distributed computing offered by modern data warehouses. This quickly becomes a problem as you operate on larger datasets. Some data platforms support optimizations for code written using pandas DataFrame API, preventing the need for major refactors. For example, [pandas on PySpark](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html) offers support for 95% of pandas functionality, using the same API while still leveraging parallel processing. :::note ❓ dbt questions - When developing a new dbt Python model, should we recommend pandas-style syntax for rapid iteration and then refactor? @@ -498,10 +564,11 @@ Python models have capabilities that SQL models do not. They also have some draw - **Time and cost.** Python models are slower to run than SQL models, and the cloud resources that run them can be more expensive. Running Python requires more general-purpose compute. That compute might sometimes live on a separate service or architecture from your SQL models. **However:** We believe that deploying Python models via dbt—with unified lineage, testing, and documentation—is, from a human standpoint, **dramatically** faster and cheaper. 
By comparison, spinning up separate infrastructure to orchestrate Python transformations in production and different tooling to integrate with dbt is much more time-consuming and expensive. - **Syntax differences** are even more pronounced. Over the years, dbt has done a lot, via dispatch patterns and packages such as `dbt_utils`, to abstract over differences in SQL dialects across popular data warehouses. Python offers a **much** wider field of play. If there are five ways to do something in SQL, there are 500 ways to write it in Python, all with varying performance and adherence to standards. Those options can be overwhelming. As the maintainers of dbt, we will be learning from state-of-the-art projects tackling this problem and sharing guidance as we develop it. - **These capabilities are very new.** As data warehouses develop new features, we expect them to offer cheaper, faster, and more intuitive mechanisms for deploying Python transformations. **We reserve the right to change the underlying implementation for executing Python models in future releases.** Our commitment to you is around the code in your model `.py` files, following the documented capabilities and guidance we're providing here. +- **Lack of `print()` support.** The data platform runs and compiles your Python model without dbt's oversight. This means it doesn't display the output of commands such as `print()` in dbt's logs. As a general rule, if there's a transformation you could write equally well in SQL or Python, we believe that well-written SQL is preferable: it's more accessible to a greater number of colleagues, and it's easier to write code that's performant at scale. If there's a transformation you _can't_ write in SQL, or where ten lines of elegant and well-annotated Python could save you 1000 lines of hard-to-read Jinja-SQL, Python is the way to go. 
-## Supported data platforms +## Specific data platforms {#specific-data-platforms} In their initial launch, Python models are supported on three of the most popular data platforms: Snowflake, Databricks, and BigQuery/GCP (via Dataproc). Both Databricks and GCP's Dataproc use PySpark as the processing framework. Snowflake uses its own framework, Snowpark, which has many similarities to PySpark. @@ -509,11 +576,11 @@ In their initial launch, Python models are supported on three of the most popula
-**Additional setup:** Snowpark Python is in Public Preview - Open and enabled by default for all accounts. You will need to [acknowledge and accept Snowflake Third Party Terms](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html#getting-started) to use Anaconda packages. +**Additional setup:** You will need to [acknowledge and accept Snowflake Third Party Terms](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html#getting-started) to use Anaconda packages. **Installing packages:** Snowpark supports several popular packages via Anaconda. The complete list is at https://repo.anaconda.com/pkgs/snowflake/. Packages are installed at the time your model is being run. Different models can have different package dependencies. If you are using third-party packages, Snowflake recommends using a dedicated virtual warehouse for best performance rather than one with many concurrent users. -**About "sprocs":** dbt submits Python models to run as "stored procedures," which some people call "sprocs" for short. By default, dbt will create a named sproc containing your model's compiled Python code, and then "call" it to execute. Snowpark has a Private Preview feature for "temporary" or "anonymous" stored procedures ([docs](https://docs.snowflake.com/en/LIMITEDACCESS/call-with.html)), which are faster and leave a cleaner query history. If this feature is enabled for your account, you can switch it on for your models by configuring `use_anonymous_sproc: True`. We plan to switch this on for all dbt + Snowpark Python models in a future release. +**About "sprocs":** dbt submits Python models to run as _stored procedures_, which some people call _sprocs_ for short. By default, dbt will create a named sproc containing your model's compiled Python code, and then _call_ it to execute. 
Snowpark has an Open Preview feature for _temporary_ or _anonymous_ stored procedures ([docs](https://docs.snowflake.com/en/sql-reference/sql/call-with.html)), which are faster and leave a cleaner query history. You can switch this feature on for your models by configuring `use_anonymous_sproc: True`. We plan to switch this on for all dbt + Snowpark Python models starting with the release of dbt Core version 1.4. @@ -595,10 +662,10 @@ Use the `cluster` submission method with dedicated Dataproc clusters you or your -The following configurations are needed to run Python models on Dataproc. You can add these to your [BigQuery profile](bigquery-profile) or configure them on specific Python models: -- `gcs_bucket`: Storage bucket to which dbt will upload your model's compiled PySpark code -- `dataproc_region`: GCP region in which you have enabled Dataproc (for example `us-central1`) -- `dataproc_cluster_name`: Name of Dataproc cluster to use for running Python model (executing PySpark job). Only required if `submission_method: cluster` +The following configurations are needed to run Python models on Dataproc. You can add these to your [BigQuery profile](/docs/core/connect-data-platform/bigquery-setup#running-python-models-on-dataproc) or configure them on specific Python models: +- `gcs_bucket`: Storage bucket to which dbt will upload your model's compiled PySpark code. +- `dataproc_region`: GCP region in which you have enabled Dataproc (for example `us-central1`). +- `dataproc_cluster_name`: Name of Dataproc cluster to use for running Python model (executing PySpark job). Only required if `submission_method: cluster`. ```python def model(dbt, session): @@ -616,12 +683,16 @@ models: submission_method: serverless ``` +Python models running on Dataproc Serverless can be further configured in your [BigQuery profile](/docs/core/connect-data-platform/bigquery-setup#running-python-models-on-dataproc). 
+ Any user or service account that runs dbt Python models will need the following permissions(in addition to the required BigQuery permissions) ([docs](https://cloud.google.com/dataproc/docs/concepts/iam/iam)): ``` +dataproc.batches.create dataproc.clusters.use dataproc.jobs.create dataproc.jobs.get dataproc.operations.get +dataproc.operations.list storage.buckets.get storage.objects.create storage.objects.delete @@ -645,4 +716,4 @@ You can also install packages at cluster creation time by [defining cluster prop - + \ No newline at end of file diff --git a/website/docs/docs/build/ratio-metrics.md b/website/docs/docs/build/ratio-metrics.md new file mode 100644 index 00000000000..97efe0f55bf --- /dev/null +++ b/website/docs/docs/build/ratio-metrics.md @@ -0,0 +1,127 @@ +--- +id: ratio +title: "Ratio metrics" +description: "Use ratio metrics to create a ratio out of two measures. " +sidebar_label: Ratio +tags: [Metrics, Semantic Layer] +--- + +Ratio allows you to create a ratio between two metrics. You simply specify a numerator and a denominator metric. Additionally, you can apply a dimensional filter to both the numerator and denominator using a constraint string when computing the metric. + + The parameters, description, and type for ratio metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `numerator` | The name of the metric used for the numerator, or structure of properties. | Required | +| `denominator` | The name of the metric used for the denominator, or structure of properties. | Required | +| `filter` | Optional filter for the numerator or denominator. 
| Optional | +| `alias` | Optional alias for the numerator or denominator. | Optional | + +The following displays the complete specification for ratio metrics, along with an example. + +```yaml +metrics: + - name: The metric name # Required + description: the metric description # Optional + type: ratio # Required + label: The value that will be displayed in downstream tools #Required + type_params: # Required + numerator: The name of the metric used for the numerator, or structure of properties # Required + name: Name of metric used for the numerator # Required + filter: Filter for the numerator # Optional + alias: Alias for the numerator # Optional + denominator: The name of the metric used for the denominator, or structure of properties # Required + name: Name of metric used for the denominator # Required + filter: Filter for the denominator # Optional + alias: Alias for the denominator # Optional +``` + +## Ratio metrics example + +```yaml +metrics: + - name: food_order_pct + description: "The food order count as a ratio of the total order count" + label: Food Order Ratio + type: ratio + type_params: + numerator: food_orders + denominator: orders + +``` +## Ratio metrics using different semantic models + +The system will simplify and turn the numerator and denominator in a ratio metric from different semantic models by computing their values in sub-queries. It will then join the result set based on common dimensions to calculate the final ratio. Here's an example of the SQL generated for such a ratio metric. 
+ + +```sql +select + subq_15577.metric_time as metric_time + , cast(subq_15577.mql_queries_created_test as double) / cast(nullif(subq_15582.distinct_query_users, 0) as double) as mql_queries_per_active_user +from ( + select + metric_time + , sum(mql_queries_created_test) as mql_queries_created_test + from ( + select + cast(query_created_at as date) as metric_time + , case when query_status in ('PENDING','MODE') then 1 else 0 end as mql_queries_created_test + from prod_dbt.mql_query_base mql_queries_test_src_2552 + ) subq_15576 + group by + metric_time +) subq_15577 +inner join ( + select + metric_time + , count(distinct distinct_query_users) as distinct_query_users + from ( + select + cast(query_created_at as date) as metric_time + , case when query_status in ('MODE','PENDING') then email else null end as distinct_query_users + from prod_dbt.mql_query_base mql_queries_src_2585 + ) subq_15581 + group by + metric_time +) subq_15582 +on + ( + ( + subq_15577.metric_time = subq_15582.metric_time + ) or ( + ( + subq_15577.metric_time is null + ) and ( + subq_15582.metric_time is null + ) + ) + ) +``` + +## Add filter + +Users can define constraints on input metrics for a ratio metric by applying a filter directly to the input metric, like so: + +```yaml +metrics: + - name: frequent_purchaser_ratio + description: Fraction of active users who qualify as frequent purchasers + owners: + - support@getdbt.com + type: ratio + type_params: + numerator: + name: distinct_purchasers + filter: | + {{Dimension('customer__is_frequent_purchaser')}} + alias: frequent_purchasers + denominator: + name: distinct_purchasers +``` + +Note the `filter` and `alias` parameters for the metric referenced in the numerator. Use the `filter` parameter to apply a filter to the metric it's attached to. The `alias` parameter is used to avoid naming conflicts in the rendered SQL queries when the same metric is used with different filters. 
If there are no naming conflicts, the `alias` parameter can be left out. diff --git a/website/docs/docs/build/saved-queries.md b/website/docs/docs/build/saved-queries.md new file mode 100644 index 00000000000..39a4b2e52fd --- /dev/null +++ b/website/docs/docs/build/saved-queries.md @@ -0,0 +1,43 @@ +--- +title: Saved queries +id: saved-queries +description: "Saved queries are a way to save commonly used queries in MetricFlow. They can be used to save time and avoid writing the same query over and over again." +sidebar_label: "Saved queries" +tags: [Metrics, Semantic Layer] +--- + +:::info Saved queries coming soon +Saved queries isn't currently available in MetricFlow but support is coming soon. +::: + +Saved queries are a way to save commonly used queries in MetricFlow. You can group metrics, dimensions, and filters that are logically related into a saved query. + +To define a saved query, refer to the following specification: + + Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `metrics` | The metrics included in the saved query. | Required | +| `group_bys` | The value displayed in downstream tools. | Required | +| `where` | Filter applied to the query. | Optional | + +The following is an example of a saved query: + +```yaml +saved_query: + name: p0_booking + description: Booking-related metrics that are of the highest priority. + metrics: + - bookings + - instant_bookings + group_bys: + - TimeDimension('metric_time', 'day') + - Dimension('listing__capacity_latest') + where: + - "{{ Dimension('listing__capacity_latest') }} > 3" +``` + +### FAQs + +* All metrics in a saved query need to use the same dimensions in the `group_by` or `where` clauses. 
diff --git a/website/docs/docs/build/seeds.md b/website/docs/docs/build/seeds.md index bd26c1fceea..6b1abf8f1c3 100644 --- a/website/docs/docs/build/seeds.md +++ b/website/docs/docs/build/seeds.md @@ -1,11 +1,13 @@ --- -title: "Seeds" +title: "Add Seeds to your DAG" +sidebar_label: "Seeds" +description: "Read this tutorial to learn how to use seeds when building in dbt." id: "seeds" --- ## Related reference docs -* [Seed configurations](seed-configs) -* [Seed properties](seed-properties) -* [`seed` command](seed) +* [Seed configurations](/reference/seed-configs) +* [Seed properties](/reference/seed-properties) +* [`seed` command](/docs/build/seeds) ## Overview Seeds are CSV files in your dbt project (typically in your `seeds` directory), that dbt can load into your using the `dbt seed` command. @@ -41,7 +43,7 @@ GB,United Kingdom -2. Run the `dbt seed` [command](seed) command — a new will be created in your warehouse in your target schema, named `country_codes` +2. Run the `dbt seed` [command](/reference/commands/seed) — a new will be created in your warehouse in your target schema, named `country_codes` ``` $ dbt seed @@ -75,15 +77,15 @@ Seeds are configured in your `dbt_project.yml`, check out the [seed configuratio ## Documenting and testing seeds -You can document and test seeds in yaml by declaring properties — check out the docs on [seed properties](seed-properties) for more information. +You can document and test seeds in YAML by declaring properties — check out the docs on [seed properties](/reference/seed-properties) for more information. 
## FAQs - - - - - - - - - \ No newline at end of file + + + + + + + + + diff --git a/website/docs/docs/build/semantic-models.md b/website/docs/docs/build/semantic-models.md new file mode 100644 index 00000000000..226c138b545 --- /dev/null +++ b/website/docs/docs/build/semantic-models.md @@ -0,0 +1,248 @@ +--- +title: "Semantic models" +id: "semantic-models" +description: "Semantic models are yml abstractions on top of a dbt mode, connected via joining keys as edges" +keywords: + - dbt metrics layer +sidebar_label: Semantic models +tags: [Metrics, Semantic Layer] +pagination_next: "docs/build/dimensions" +--- + +Semantic models are the foundation for data definition in MetricFlow, which powers the dbt Semantic Layer: + +- Think of semantic models as nodes connected by entities in a semantic graph. +- MetricFlow uses YAML configuration files to create this graph for querying metrics. +- Each semantic model corresponds to a dbt model in your DAG, requiring a unique YAML configuration for each semantic model. +- You can create multiple semantic models from a single dbt model, as long as you give each semantic model a unique name. +- Configure semantic models in a YAML file within your dbt project directory. +- Organize them under a `metrics:` folder or within project sources as needed. + +Semantic models have 6 components and this page explains the definitions with some examples: + +| Component | Description | Type | +| --------- | ----------- | ---- | +| [Name](#name) | Choose a unique name for the semantic model. Avoid using double underscores (__) in the name as they're not supported. | Required | +| [Description](#description) | Includes important details in the description | Optional | +| [Model](#model) | Specifies the dbt model for the semantic model using the `ref` function | Required | +| [Defaults](#defaults) | The defaults for the model, currently only `agg_time_dimension` is supported. 
| Required | +| [Entities](#entities) | Uses the columns from entities as join keys and indicate their type as primary, foreign, or unique keys with the `type` parameter | Required | +| [Primary Entity](#primary-entity) | If a primary entity exists, this component is Optional. If the semantic model has no primary entity, then this property is required. | Optional | +| [Dimensions](#dimensions) | Different ways to group or slice data for a metric, they can be `time` or `categorical` | Required | +| [Measures](#measures) | Aggregations applied to columns in your data model. They can be the final metric or used as building blocks for more complex metrics | Optional | +| Label | The display name for your semantic model `node`, `dimension`, `entity`, and/or `measures` | Optional | + +## Semantic models components + +The complete spec for semantic models is below: + +```yaml +semantic_models: + - name: the_name_of_the_semantic_model ## Required + description: same as always ## Optional + model: ref('some_model') ## Required + defaults: ## Required + agg_time_dimension: dimension_name ## Required if the model contains dimensions + entities: ## Required + - see more information in entities + measures: ## Optional + - see more information in measures section + dimensions: ## Required + - see more information in dimensions section + primary_entity: >- + if the semantic model has no primary entity, then this property is required. #Optional if a primary entity exists, otherwise Required +``` + +The following example displays a complete configuration and detailed descriptions of each field: + +```yaml +semantic_models: + - name: transaction # A semantic model with the name Transactions + model: ref('fact_transactions') # References the dbt model named `fact_transactions` + description: "Transaction fact table at the transaction level. This table contains one row per transaction and includes the transaction timestamp." 
+ defaults: + agg_time_dimension: transaction_date + + entities: # Entities included in the table are defined here. MetricFlow will use these columns as join keys. + - name: transaction + type: primary + expr: transaction_id + - name: customer + type: foreign + expr: customer_id + + dimensions: # dimensions are qualitative values such as names, dates, or geographical data. They provide context to metrics and allow "metric by group" data slicing. + - name: transaction_date + type: time + type_params: + time_granularity: day + + - name: transaction_location + type: categorical + expr: order_country + + measures: # Measures are columns we perform an aggregation over. Measures are inputs to metrics. + - name: transaction_total + description: "The total value of the transaction." + agg: sum + + - name: sales + description: "The total sale of the transaction." + agg: sum + expr: transaction_total + + - name: median_sales + description: "The median sale of the transaction." + agg: median + expr: transaction_total + + - name: customers # Another semantic model called customers. + model: ref('dim_customers') + description: "A customer dimension table." + + entities: + - name: customer + type: primary + expr: customer_id + + dimensions: + - name: first_name + type: categorical +``` + + + +Semantic models support configs in either the schema file or at the project level. + +Semantic model config in `models/semantic.yml`: +```yml +semantic_models: + - name: orders + config: + enabled: true | false + group: some_group +``` + +Semantic model config in `dbt_project.yml`: +```yml +semantic_models: + my_project_name: + +enabled: true | false + +group: some_group +``` + + + +### Name + +Define the name of the semantic model. You must define a unique name for the semantic model. The semantic graph will use this name to identify the model, and you can update it at any time. Avoid using double underscores (__) in the name as they're not supported. 
+ +### Description + +Includes important details in the description of the semantic model. This description will primarily be used by other configuration contributors. You can use the pipe operator `(|)` to include multiple lines in the description. + +### Model + +Specify the dbt model for the semantic model using the [`ref` function](/reference/dbt-jinja-functions/ref). + +### Defaults + +Defaults for the semantic model. Currently only `agg_time_dimension`. `agg_time_dimension` represents the default time dimensions for measures. This can be overridden by adding the `agg_time_dimension` key directly to a measure - see [Dimensions](/docs/build/dimensions) for examples. +### Entities + +To specify the [entities](/docs/build/entities) in your model, use their columns as join keys and indicate their `type` as primary, foreign, or unique keys with the type parameter. + +### Primary entity + +MetricFlow requires that all dimensions be tied to an entity. This is to guarantee unique dimension names. If your data source doesn't have a primary entity, you need to assign the entity a name using the `primary_entity: entity_name` key. It doesn't necessarily have to map to a column in that table and assigning the name doesn't affect query generation. + +You can define a primary entity using the following configs: + +```yaml +semantic_model: + name: bookings_monthly_source + description: bookings_monthly_source + defaults: + agg_time_dimension: ds + model: ref('bookings_monthly_source') + measures: + - name: bookings_monthly + agg: sum + create_metric: true + primary_entity: booking_id + ``` + + + + + +Here are the types of keys: + +- **Primary** — Only one record per row in the table, and it includes every record in the data platform. +- **Unique** — Only one record per row in the table, but it may have a subset of records in the data platform. Null values may also be present. +- **Foreign** — Can have zero, one, or multiple instances of the same record. 
Null values may also be present. +- **Natural** — A column or combination of columns in a table that uniquely identifies a record based on real-world data. For example, the `sales_person_id` can serve as a natural key in a `sales_person_department` dimension table. + + + + +This example shows a semantic model with three entities and their entity types: `transaction` (primary), `order` (foreign), and `user` (foreign). + +To reference a desired column, use the actual column name from the model in the `name` parameter. You can also use `name` as an alias to rename the column, and the `expr` parameter to refer to the original column name or a SQL expression of the column. + + +```yaml +entity: + - name: transaction + type: primary + - name: order + type: foreign + expr: id_order + - name: user + type: foreign + expr: substring(id_order FROM 2) +``` + +You can refer to entities (join keys) in a semantic model using the `name` parameter. Entity names must be unique within a semantic model, and identifier names can be non-unique across semantic models since MetricFlow uses them for [joins](/docs/build/join-logic). + + + + +### Dimensions + +[Dimensions](/docs/build/dimensions) are the different ways you can group or slice data for a metric. It can be time-consuming and error-prone to anticipate all possible options in a single table, such as region, country, user role, and so on. + +MetricFlow simplifies this by allowing you to query all metric groups and construct the join during the query. To specify dimensions parameters, include the `name` (either a column or SQL expression) and `type` (`categorical` or `time`). Categorical groups represent qualitative values, while time groups represent dates of varying granularity. + +Dimensions are identified using the name parameter, just like identifiers. The naming of groups must be unique within a semantic model, but not across semantic models since MetricFlow, uses entities to determine the appropriate groups. 
MetricFlow requires all dimensions be tied to a primary entity. + +:::info For time groups + +For semantic models with a measure, you must have a [primary time group](/docs/build/dimensions#time). + +::: + +### Measures + +[Measures](/docs/build/measures) are aggregations applied to columns in your data model. They can be used as the foundational building blocks for more complex metrics, or be the final metric itself. Measures have various parameters which are listed in a table along with their descriptions and types. + +| Parameter | Description | Field type | +| --- | --- | --- | +| `name`| Provide a name for the measure, which must be unique and can't be repeated across all semantic models in your dbt project. | Required | +| `description` | Describes the calculated measure. | Optional | +| `agg` | dbt supports the following aggregations: `sum`, `max`, `min`, `count_distinct`, and `sum_boolean`. | Required | +| `expr` | You can either reference an existing column in the table or use a SQL expression to create or derive a new one. | Optional | +| `non_additive_dimension` | Non-additive dimensions can be specified for measures that cannot be aggregated over certain dimensions, such as bank account balances, to avoid producing incorrect results. | Optional | +| `create_metric` | You can create a metric directly from a measure with `create_metric: True` and specify its display name with create_metric_display_name. Default is false. 
| Optional | + + +import SetUpPages from '/snippets/_metrics-dependencies.md'; + + + +## Related docs + +- [About MetricFlow](/docs/build/about-metricflow) +- [Dimensions](/docs/build/dimensions) +- [Entities](/docs/build/entities) +- [Measures](/docs/build/measures) diff --git a/website/docs/docs/build/simple.md b/website/docs/docs/build/simple.md new file mode 100644 index 00000000000..1803e952a69 --- /dev/null +++ b/website/docs/docs/build/simple.md @@ -0,0 +1,62 @@ +--- +title: "Simple metrics" +id: simple +description: "Use simple metrics to directly reference a single measure." +sidebar_label: Simple +tags: [Metrics, Semantic Layer] +pagination_next: null +--- + +Simple metrics are metrics that directly reference a single measure, without any additional measures involved. They are aggregations over a column in your data platform and can be filtered by one or multiple dimensions. + + The parameters, description, and type for simple metrics are: + +| Parameter | Description | Type | +| --------- | ----------- | ---- | +| `name` | The name of the metric. | Required | +| `description` | The description of the metric. | Optional | +| `type` | The type of the metric (cumulative, derived, ratio, or simple). | Required | +| `label` | The value that will be displayed in downstream tools. | Required | +| `type_params` | The type parameters of the metric. | Required | +| `measure` | The measure you're referencing. | Required | + +The following displays the complete specification for simple metrics, along with an example. 
+ + +```yaml +metrics: + - name: The metric name # Required + description: the metric description # Optional + type: simple # Required + label: The value that will be displayed in downstream tools #Required + type_params: # Required + measure: The measure you're referencing # Required + +``` + + + +## Simple metrics example + +```yaml + metrics: + - name: customers + description: Count of customers + type: simple # Pointers to a measure you created in a semantic model + label: Count of customers + type_params: + measure: customers # The measure youre creating a proxy of. + - name: large_orders + description: "Order with order values over 20." + type: SIMPLE + label: Large Orders + type_params: + measure: orders + filter: | # For any metric you can optionally include a filter on dimension values + {{Dimension('customer__order_total_dim')}} >= 20 +``` diff --git a/website/docs/docs/build/sl-getting-started.md b/website/docs/docs/build/sl-getting-started.md new file mode 100644 index 00000000000..34c3479cbc0 --- /dev/null +++ b/website/docs/docs/build/sl-getting-started.md @@ -0,0 +1,96 @@ +--- +id: sl-getting-started +title: Get started with MetricFlow +description: "Learn how to create your first semantic model and metric." +sidebar_label: Get started with MetricFlow +tags: [Metrics, Semantic Layer] +meta: + api_name: dbt Semantic Layer APIs +--- + +import CreateModel from '/snippets/_sl-create-semanticmodel.md'; +import DefineMetrics from '/snippets/_sl-define-metrics.md'; +import ConfigMetric from '/snippets/_sl-configure-metricflow.md'; +import TestQuery from '/snippets/_sl-test-and-query-metrics.md'; +import ConnectQueryAPI from '/snippets/_sl-connect-and-query-api.md'; +import RunProdJob from '/snippets/_sl-run-prod-job.md'; + +This getting started page presents a sample workflow to help you create your first metrics in dbt Cloud or the command line interface (CLI). 
It uses the [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) as the project data source and is available for you to use. + +If you prefer, you can create semantic models and metrics for your own dbt project. This page will guide you on how to: + +- [Create a semantic model](#create-a-semantic-model) using MetricFlow +- [Define metrics](#define-metrics) using MetricFlow +- [Test and query metrics](#test-and-query-metrics) using MetricFlow +- [Run a production job](#run-a-production-job) in dbt Cloud +- [Set up dbt Semantic Layer](#set-up-dbt-semantic-layer) in dbt Cloud +- [Connect to and query the API](#connect-and-query-api) with dbt Cloud + +MetricFlow allows you to define metrics in your dbt project and query them whether in dbt Cloud or dbt Core with [MetricFlow commands](/docs/build/metricflow-commands). + +However, to experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. + +## Prerequisites + +import SetUp from '/snippets/_v2-sl-prerequisites.md'; + + + +:::tip +New to dbt or metrics? Try our [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) to help you get started! +::: + +## Create a semantic model + + + +## Define metrics + + + +## Configure the MetricFlow time spine model + + + +## Test and query metrics + + + +## Run a production job + + + +## Set up dbt Semantic Layer + +import SlSetUp from '/snippets/_new-sl-setup.md'; + + + +## Connect and query API + + + +## FAQs + +If you're encountering some issues when defining your metrics or setting up the dbt Semantic Layer, check out a list of answers to some of the questions or problems you may be experiencing. + +
+ How do I migrate from the legacy Semantic Layer to the new one? +
+
If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
+
+
+
+How are you storing my data? +User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours. +
+
+Is the dbt Semantic Layer open source? +The dbt Semantic Layer is proprietary; however, some components of the dbt Semantic Layer are open source, like dbt-core and MetricFlow.

dbt Cloud Developer or dbt Core users can define metrics in their project, including a local dbt Core project, using the dbt Cloud IDE or the MetricFlow CLI. However, to experience the universal dbt Semantic Layer and access those metrics using the API or downstream tools, users must be on a dbt Cloud Team or Enterprise plan.
+ +## Next steps + +- [About MetricFlow](/docs/build/about-metricflow) +- [Build your metrics](/docs/build/build-metrics-intro) +- [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) diff --git a/website/docs/docs/build/snapshots.md b/website/docs/docs/build/snapshots.md index a204c0416c5..947d4ebef38 100644 --- a/website/docs/docs/build/snapshots.md +++ b/website/docs/docs/build/snapshots.md @@ -1,14 +1,15 @@ --- -title: "Snapshots" +title: "Add snapshots to your DAG" +sidebar_label: "Snapshots" +description: "Read this tutorial to learn how to use snapshots when building in dbt." id: "snapshots" --- ## Related documentation -* [Snapshot configurations](snapshot-configs) -* [Snapshot properties](snapshot-properties) -* [`snapshot` command](snapshot) +* [Snapshot configurations](/reference/snapshot-configs) +* [Snapshot properties](/reference/snapshot-properties) +* [`snapshot` command](/reference/commands/snapshot) -## Overview ### What are snapshots? Analysts often need to "look back in time" at previous data states in their mutable tables. While some source data systems are built in a way that makes accessing historical data possible, this is not always the case. dbt provides a mechanism, **snapshots**, which records changes to a mutable over time. @@ -34,7 +35,7 @@ This order is now in the "shipped" state, but we've lost the information about w In dbt, snapshots are `select` statements, defined within a snapshot block in a `.sql` file (typically in your `snapshots` directory). You'll also need to configure your snapshot to tell dbt how to detect record changes. 
- + ```sql {% snapshot orders_snapshot %} @@ -63,13 +64,13 @@ It is not possible to "preview data" or "compile sql" for snapshots in dbt Cloud ::: -When you run the [`dbt snapshot` command](snapshot): +When you run the [`dbt snapshot` command](/reference/commands/snapshot): * **On the first run:** dbt will create the initial snapshot table — this will be the result set of your `select` statement, with additional columns including `dbt_valid_from` and `dbt_valid_to`. All records will have a `dbt_valid_to = null`. * **On subsequent runs:** dbt will check which records have changed or if any new records have been created: - The `dbt_valid_to` column will be updated for any existing records that have changed - The updated record and any new records will be inserted into the snapshot table. These records will now have `dbt_valid_to = null` -Snapshots can be referenced in downstream models the same way as referencing models — by using the [ref](ref) function. +Snapshots can be referenced in downstream models the same way as referencing models — by using the [ref](/reference/dbt-jinja-functions/ref) function. ## Example @@ -78,7 +79,7 @@ To add a snapshot to your project: 1. Create a file in your `snapshots` directory with a `.sql` file extension, e.g. `snapshots/orders.sql` 2. Use a `snapshot` block to define the start and end of a snapshot: - + ```sql {% snapshot orders_snapshot %} @@ -90,7 +91,7 @@ To add a snapshot to your project: 3. Write a `select` statement within the snapshot block (tips for writing a good snapshot query are below). This select statement defines the results that you want to snapshot over time. You can use `sources` and `refs` here. - + ```sql {% snapshot orders_snapshot %} @@ -104,9 +105,9 @@ select * from {{ source('jaffle_shop', 'orders') }} 4. Check whether the result set of your query includes a reliable timestamp column that indicates when a record was last updated. 
For our example, the `updated_at` column reliably indicates record changes, so we can use the `timestamp` strategy. If your query result set does not have a reliable timestamp, you'll need to instead use the `check` strategy — more details on this below. -5. Add configurations to your snapshot using a `config` block (more details below). You can also configure your snapshot from your `dbt_project.yml` file ([docs](snapshot-configs)). +5. Add configurations to your snapshot using a `config` block (more details below). You can also configure your snapshot from your `dbt_project.yml` file ([docs](/reference/snapshot-configs)). - + ```sql {% snapshot orders_snapshot %} @@ -129,7 +130,7 @@ select * from {{ source('jaffle_shop', 'orders') }} -6. Run the `dbt snapshot` [command](snapshot) — for our example a new table will be created at `analytics.snapshots.orders_snapshot`. You can change the `target_database` configuration, the `target_schema` configuration and the name of the snapshot (as defined in `{% snapshot .. %}`) will change how dbt names this table. +6. Run the `dbt snapshot` [command](/reference/commands/snapshot) — for our example a new table will be created at `analytics.snapshots.orders_snapshot`. You can change the `target_database` configuration, the `target_schema` configuration and the name of the snapshot (as defined in `{% snapshot .. %}`) will change how dbt names this table. ``` $ dbt snapshot @@ -178,7 +179,7 @@ The `timestamp` strategy requires the following configurations: **Example usage:** - + ```sql {% snapshot orders_snapshot_timestamp %} @@ -219,7 +220,7 @@ The `check` snapshot strategy can be configured to track changes to _all_ column **Example Usage** - + ```sql {% snapshot orders_snapshot_check %} @@ -243,17 +244,15 @@ The `check` snapshot strategy can be configured to track changes to _all_ column ### Hard deletes (opt-in) -New in v0.19.0 - Rows that are deleted from the source query are not invalidated by default. 
With the config option `invalidate_hard_deletes`, dbt can track rows that no longer exist. This is done by left joining the snapshot table with the source table, and filtering the rows that are still valid at that point, but no longer can be found in the source table. `dbt_valid_to` will be set to the current snapshot time. This configuration is not a different strategy as described above, but is an additional opt-in feature. It is not enabled by default since it alters the previous behavior. -For this configuration to work, the configured `updated_at` column must be of timestamp type. Otherwise, queries will fail due to mixing data types. +For this configuration to work with the `timestamp` strategy, the configured `updated_at` column must be of timestamp type. Otherwise, queries will fail due to mixing data types. **Example Usage** - + ```sql {% snapshot orders_snapshot_hard_delete %} @@ -282,24 +281,24 @@ There are a number of snapshot-specific configurations: | Config | Description | Required? | Example | | ------ | ----------- | --------- | ------- | -| [target_database](target_database) | The database that dbt should render the snapshot table into | No | analytics | -| [target_schema](target_schema) | The schema that dbt should render the snapshot table into | Yes | snapshots | -| [strategy](strategy) | The snapshot strategy to use. 
One of `timestamp` or `check` | Yes | timestamp | -| [unique_key](unique_key) | A column or expression for the record | Yes | id | -| [check_cols](check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | -| [updated_at](updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | -| [invalidate_hard_deletes](invalidate_hard_deletes) | Find hard deleted records in source, and set `dbt_valid_to` current time if no longer exists | No | True | +| [target_database](/reference/resource-configs/target_database) | The database that dbt should render the snapshot table into | No | analytics | +| [target_schema](/reference/resource-configs/target_schema) | The schema that dbt should render the snapshot table into | Yes | snapshots | +| [strategy](/reference/resource-configs/strategy) | The snapshot strategy to use. One of `timestamp` or `check` | Yes | timestamp | +| [unique_key](/reference/resource-configs/unique_key) | A column or expression for the record | Yes | id | +| [check_cols](/reference/resource-configs/check_cols) | If using the `check` strategy, then the columns to check | Only if using the `check` strategy | ["status"] | +| [updated_at](/reference/resource-configs/updated_at) | If using the `timestamp` strategy, the timestamp column to compare | Only if using the `timestamp` strategy | updated_at | +| [invalidate_hard_deletes](/reference/resource-configs/invalidate_hard_deletes) | Find hard deleted records in source, and set `dbt_valid_to` current time if no longer exists | No | True | -A number of other configurations are also supported (e.g. `tags` and `post-hook`), check out the full list [here](snapshot-configs). +A number of other configurations are also supported (e.g. `tags` and `post-hook`), check out the full list [here](/reference/snapshot-configs). 
-Snapshots can be configured from both your `dbt_project.yml` file and a `config` block, check out the [configuration docs](snapshot-configs) for more information. +Snapshots can be configured from both your `dbt_project.yml` file and a `config` block, check out the [configuration docs](/reference/snapshot-configs) for more information. -Note: As of v0.21, BigQuery users can use `target_project` and `target_dataset` as aliases for `target_database` and `target_schema`, respectively. +Note: BigQuery users can use `target_project` and `target_dataset` as aliases for `target_database` and `target_schema`, respectively. ### Configuration best practices #### Use the `timestamp` strategy where possible -This strategy handles column additions and deletions better than the `check_cols` strategy. +This strategy handles column additions and deletions better than the `check` strategy. #### Ensure your unique key is really unique The unique key is used by dbt to match rows up, so it's extremely important to make sure this key is actually unique! If you're snapshotting a source, I'd recommend adding a uniqueness test to your source ([example](https://github.com/dbt-labs/jaffle_shop/blob/8e7c853c858018180bef1756ec93e193d9958c5b/models/staging/schema.yml#L26)). @@ -346,7 +345,7 @@ For the `timestamp` strategy, the configured `updated_at` column is used to popu
Details for the timestamp strategy -Snapshot query results at `2019-01-01 11:00` +Snapshot query results at `2019-01-01 11:00` | id | status | updated_at | | -- | ------- | ---------------- | @@ -368,19 +367,19 @@ Snapshot results (note that `11:30` is not used anywhere): | id | status | updated_at | dbt_valid_from | dbt_valid_to | dbt_updated_at | | -- | ------- | ---------------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 10:47 | 2019-01-01 10:47 | 2019-01-01 11:05 | 2019-01-01 11:05 | +| 1 | pending | 2019-01-01 10:47 | 2019-01-01 10:47 | 2019-01-01 11:05 | 2019-01-01 10:47 | | 1 | shipped | 2019-01-01 11:05 | 2019-01-01 11:05 | | 2019-01-01 11:05 |

-For the `check` strategy, the current timestamp is used to populate each column +For the `check` strategy, the current timestamp is used to populate each column. If configured, the `check` strategy uses the `updated_at` column instead, as with the `timestamp` strategy.
Details for the check strategy -Snapshot query results at `2019-01-01 11:00` +Snapshot query results at `2019-01-01 11:00` | id | status | | -- | ------- | @@ -402,16 +401,17 @@ Snapshot results: | id | status | dbt_valid_from | dbt_valid_to | dbt_updated_at | | --- | ------- | ---------------- | ---------------- | ---------------- | -| 1 | pending | 2019-01-01 11:00 | 2019-01-01 11:30 | 2019-01-01 11:30 | +| 1 | pending | 2019-01-01 11:00 | 2019-01-01 11:30 | 2019-01-01 11:00 | | 1 | shipped | 2019-01-01 11:30 | | 2019-01-01 11:30 |
## FAQs - - - - - - \ No newline at end of file + + + + + + + diff --git a/website/docs/docs/build/sources.md b/website/docs/docs/build/sources.md index e5802ada6db..a657b6257c9 100644 --- a/website/docs/docs/build/sources.md +++ b/website/docs/docs/build/sources.md @@ -1,14 +1,16 @@ --- -title: "Sources" +title: "Add sources to your DAG" +sidebar_label: "Sources" +description: "Read this tutorial to learn how to use sources when building in dbt." id: "sources" search_weight: "heavy" --- ## Related reference docs -* [Source properties](source-properties) -* [Source configurations](source-configs) +* [Source properties](/reference/source-properties) +* [Source configurations](/reference/source-configs) * [`{{ source() }}` jinja function](/reference/dbt-jinja-functions/source) -* [`source freshness` command](commands/source) +* [`source freshness` command](/reference/commands/source) ## Using sources Sources make it possible to name and describe the data loaded into your warehouse by your Extract and Load tools. By declaring these tables as sources in dbt, you can then @@ -27,6 +29,8 @@ version: 2 sources: - name: jaffle_shop + database: raw + schema: jaffle_shop tables: - name: orders - name: customers @@ -38,7 +42,9 @@ sources:
-If you're not already familiar with these files, be sure to check out [the documentation on schema.yml files](configs-and-properties) before proceeding. +*By default, `schema` will be the same as `name`. Add `schema` only if you want to use a source name that differs from the existing schema. + +If you're not already familiar with these files, be sure to check out [the documentation on schema.yml files](/reference/configs-and-properties) before proceeding. ### Selecting from a source @@ -85,7 +91,7 @@ You can also: - Add tests to sources - Add descriptions to sources, that get rendered as part of your documentation site -These should be familiar concepts if you've already added tests and descriptions to your models (if not check out the guides on [testing](/docs/build/tests) and [documentation](documentation)). +These should be familiar concepts if you've already added tests and descriptions to your models (if not check out the guides on [testing](/docs/build/tests) and [documentation](/docs/collaborate/documentation)). @@ -115,14 +121,14 @@ sources: -You can find more details on the available properties for sources in the [reference section](source-properties). +You can find more details on the available properties for sources in the [reference section](/reference/source-properties). ### FAQs - - - - - + + + + + ## Snapshotting source data freshness With a couple of extra configs, dbt can optionally snapshot the "freshness" of the data in your source tables. This is useful for understanding if your data pipelines are in a healthy state, and is a critical component of defining SLAs for your warehouse. @@ -162,16 +168,16 @@ In the `freshness` block, one or both of `warn_after` and `error_after` can be p Additionally, the `loaded_at_field` is required to calculate freshness for a table. If a `loaded_at_field` is not provided, then dbt will not calculate freshness for the table. 
-These configs are applied hierarchically, so `freshness` and `loaded_at` field values specified for a `source` will flow through to all of the `tables` defined in that source. This is useful when all of the tables in a source have the same `loaded_at_field`, as the config can just be specified once in the top-level source definition. +These configs are applied hierarchically, so `freshness` and `loaded_at_field` values specified for a `source` will flow through to all of the `tables` defined in that source. This is useful when all of the tables in a source have the same `loaded_at_field`, as the config can just be specified once in the top-level source definition. ### Checking source freshness -To snapshot freshness information for your sources, use the `dbt source freshness` command ([reference docs](commands/source)): +To snapshot freshness information for your sources, use the `dbt source freshness` command ([reference docs](/reference/commands/source)): ``` $ dbt source freshness ``` -Behind the scenes, dbt uses the freshness properties to construct a `select` query, shown below. You can find this query in the logs. +Behind the scenes, dbt uses the freshness properties to construct a `select` query, shown below. You can find this query in the [query logs](/faqs/runs/checking-logs). ```sql select @@ -185,8 +191,19 @@ The results of this query are used to determine whether the source is fresh or n +### Filter + +Some databases can have tables where a filter over certain columns is required in order to prevent a full scan of the table, which could be costly. To do a freshness check on such tables, a `filter` argument can be added to the configuration, e.g. `filter: _etl_loaded_at >= date_sub(current_date(), interval 1 day)`.
For the example above, the resulting query would look like + +```sql +select + max(_etl_loaded_at) as max_loaded_at, + convert_timezone('UTC', current_timestamp()) as snapshotted_at +from raw.jaffle_shop.orders +where _etl_loaded_at >= date_sub(current_date(), interval 1 day) +``` ### FAQs - - - + + + diff --git a/website/docs/docs/build/sql-models.md b/website/docs/docs/build/sql-models.md index d4c8b3de23c..65fdd58adf0 100644 --- a/website/docs/docs/build/sql-models.md +++ b/website/docs/docs/build/sql-models.md @@ -1,19 +1,20 @@ --- title: "SQL models" +description: "Read this tutorial to learn how to use SQL models when building in dbt." id: "sql-models" --- ## Related reference docs -* [Model configurations](model-configs) -* [Model properties](model-properties) -* [`run` command](run) -* [`ref` function](ref) +* [Model configurations](/reference/model-configs) +* [Model properties](/reference/model-properties) +* [`run` command](/reference/commands/run) +* [`ref` function](/reference/dbt-jinja-functions/ref) ## Getting started :::info Building your first models -If you're new to dbt, we recommend that you read the [Getting Started guide](/docs/get-started/getting-started/overview) to build your first dbt project with models. +If you're new to dbt, we recommend that you read a [quickstart guide](/quickstarts) to build your first dbt project with models. ::: @@ -21,7 +22,7 @@ If you're new to dbt, we recommend that you read the [Getting Started guide](/do Starting in v1.3, dbt Core adds support for **Python models**. -dbt's Python capabilities are an extension of its capabilities with SQL models. If you're new to dbt, we recommend that you read this page first, before reading: ["Python Models"](python-models) +dbt's Python capabilities are an extension of its capabilities with SQL models. 
If you're new to dbt, we recommend that you read this page first, before reading: ["Python Models"](/docs/building-a-dbt-project/building-models/python-models) @@ -30,7 +31,7 @@ A SQL model is a `select` statement. Models are defined in `.sql` files (typical - The model name is inherited from the filename. - Models can be nested in subdirectories within the `models` directory -When you execute the [`dbt run` command](run), dbt will build this model by wrapping it in a `create view as` or `create table as` statement. +When you execute the [`dbt run` command](/reference/commands/run), dbt will build this model by wrapping it in a `create view as` or `create table as` statement. For example, consider this `customers` model: @@ -102,18 +103,18 @@ Why a _view_ named `dbt_alice.customers`? By default dbt will: You can use _configurations_ to change any of these behaviors — more on that later. ### FAQs - - - - - + + + + + ## Configuring models Configurations are "model settings" that can be set in your `dbt_project.yml` file, _and_ in your model file using a `config` block. Some example configurations include: -* Changing the that a model uses — a [materialization](materializations) determines the SQL that dbt uses to create the model in your warehouse. +* Changing the that a model uses — a [materialization](/docs/build/materializations) determines the SQL that dbt uses to create the model in your warehouse. * Build models into separate [schemas](/docs/build/custom-schemas). -* Apply [tags](resource-configs/tags) to a model. +* Apply [tags](/reference/resource-configs/tags) to a model. Here's an example of model configuration: @@ -154,15 +155,15 @@ with customer_orders as ... It is important to note that configurations are applied hierarchically — a configuration applied to a subdirectory will override any general configurations. -You can learn more about configurations in the [reference docs](model-configs). 
+You can learn more about configurations in the [reference docs](/reference/model-configs). ### FAQs - - + + ## Building dependencies between models -You can build dependencies between models by using the [`ref` function](ref) in place of table names in a query. Use the name of another model as the argument for `ref`. +You can build dependencies between models by using the [`ref` function](/reference/dbt-jinja-functions/ref) in place of table names in a query. Use the name of another model as the argument for `ref`. - - - - - - - - + + + + + + + + + diff --git a/website/docs/docs/build/tests.md b/website/docs/docs/build/tests.md index d9bdf3435bd..75ee5992a76 100644 --- a/website/docs/docs/build/tests.md +++ b/website/docs/docs/build/tests.md @@ -1,13 +1,17 @@ --- -title: "Tests" +title: "Add tests to your DAG" +sidebar_label: "Tests" +description: "Read this tutorial to learn how to use tests when building in dbt." +search_weight: "heavy" id: "tests" +keywords: + - test, tests, testing, dag --- - ## Related reference docs -* [Test command](commands/test) -* [Test properties](resource-properties/tests) -* [Test configurations](test-configs) -* [Test selection examples](test-selection-examples) +* [Test command](/reference/commands/test) +* [Test properties](/reference/resource-properties/tests) +* [Test configurations](/reference/test-configs) +* [Test selection examples](/reference/node-selection/test-selection-examples) ## Overview @@ -15,29 +19,25 @@ Tests are assertions you make about your models and other resources in your dbt You can use tests to improve the integrity of the SQL in each model by making assertions about the results generated. Out of the box, you can test whether a specified column in a model only contains non-null values, unique values, or values that have a corresponding value in another model (for example, a `customer_id` for an `order` corresponds to an `id` in the `customers` model), and values from a specified list. 
You can extend tests to suit business logic specific to your organization – any assertion that you can make about your model in the form of a select query can be turned into a test. - - -* `v0.20.0`: Both types of tests return a set of failing records. Previously, generic/schema tests returned a numeric value representing failures. Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. - - +Both types of tests return a set of failing records. Previously, generic/schema tests returned a numeric value representing failures. Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. Like almost everything in dbt, tests are SQL queries. In particular, they are `select` statements that seek to grab "failing" records, ones that disprove your assertion. If you assert that a column is unique in a model, the test query selects for duplicates; if you assert that a column is never null, the test seeks after nulls. If the test returns zero failing rows, it passes, and your assertion has been validated. There are two ways of defining tests in dbt: -* A **singular** test is testing in its simplest form: If you can write a SQL query that returns failing rows, you can save that query in a `.sql` file within your [test directory](test-paths). It's now a test, and it will be executed by the `dbt test` command. -* A **generic** test is a parametrized query that accepts arguments. The test query is defined in a special `test` block (like a [macro](jinja-macros)). Once defined, you can reference the generic test by name throughout your `.yml` files—define it on models, columns, sources, snapshots, and seeds. dbt ships with four generic tests built in, and we think you should use them! 
+* A **singular** test is testing in its simplest form: If you can write a SQL query that returns failing rows, you can save that query in a `.sql` file within your [test directory](/reference/project-configs/test-paths). It's now a test, and it will be executed by the `dbt test` command. +* A **generic** test is a parameterized query that accepts arguments. The test query is defined in a special `test` block (like a [macro](jinja-macros)). Once defined, you can reference the generic test by name throughout your `.yml` files—define it on models, columns, sources, snapshots, and seeds. dbt ships with four generic tests built in, and we think you should use them! Defining tests is a great way to confirm that your code is working correctly, and helps prevent regressions when your code changes. Because you can use them over and over again, making similar assertions with minor variations, generic tests tend to be much more common—they should make up the bulk of your dbt testing suite. That said, both ways of defining tests have their time and place. :::tip Creating your first tests -If you're new to dbt, we recommend that you check out our [Getting Started guide](/docs/get-started/getting-started/overview) to build your first dbt project with models and tests. +If you're new to dbt, we recommend that you check out our [quickstart guide](/quickstarts) to build your first dbt project with models and tests. ::: ## Singular tests The simplest way to define a test is by writing the exact SQL that will return failing records. We call these "singular" tests, because they're one-off assertions usable for a single purpose. -These tests are defined in `.sql` files, typically in your `tests` directory (as defined by your [`test-paths` config](test-paths)). You can use Jinja (including `ref` and `source`) in the test definition, just like you can when creating models. 
Each `.sql` file contains one `select` statement, and it defines one test: +These tests are defined in `.sql` files, typically in your `tests` directory (as defined by your [`test-paths` config](/reference/project-configs/test-paths)). You can use Jinja (including `ref` and `source`) in the test definition, just like you can when creating models. Each `.sql` file contains one `select` statement, and it defines one test: @@ -74,7 +74,7 @@ Certain tests are generic: they can be reused over and over again. A generic tes You'll notice that there are two arguments, `model` and `column_name`, which are then templated into the query. This is what makes the test "generic": it can be defined on as many columns as you like, across as many models as you like, and dbt will pass the values of `model` and `column_name` accordingly. Once that generic test has been defined, it can be added as a _property_ on any existing model (or source, seed, or snapshot). These properties are added in `.yml` files in the same directory as your resource. :::info -If this is your first time working with adding properties to a resource, check out the docs on [declaring properties](configs-and-properties). +If this is your first time working with adding properties to a resource, check out the docs on [declaring properties](/reference/configs-and-properties). ::: Out of the box, dbt ships with four generic tests already defined: `unique`, `not_null`, `accepted_values` and `relationships`. Here's a full example using those tests on an `orders` model: @@ -108,11 +108,11 @@ In plain English, these tests translate to: Behind the scenes, dbt constructs a `select` query for each test, using the parametrized query from the generic test block. These queries return the rows where your assertion is _not_ true; if the test returns zero rows, your assertion passes. 
-You can find more information about these tests, and additional configurations (including [`severity`](severity) and [`tags`](resource-configs/tags)) in the [reference section](resource-properties/tests). +You can find more information about these tests, and additional configurations (including [`severity`](/reference/resource-configs/severity) and [`tags`](/reference/resource-configs/tags)) in the [reference section](/reference/resource-properties/tests). ### More generic tests -Those four tests are enough to get you started. You'll quickly find you want to use a wider variety of tests—a good thing! You can also install generic tests from a package, or write your own, to use (and reuse) across your dbt project. Check out the [guide on custom generic tests](custom-generic-tests) for more information. +Those four tests are enough to get you started. You'll quickly find you want to use a wider variety of tests—a good thing! You can also install generic tests from a package, or write your own, to use (and reuse) across your dbt project. Check out the [guide on custom generic tests](/guides/best-practices/writing-custom-generic-tests) for more information. :::info There are generic tests defined in some open source packages, such as [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) and [dbt-expectations](https://hub.getdbt.com/calogica/dbt_expectations/latest/) — skip ahead to the docs on [packages](/docs/build/packages) to learn more! @@ -140,7 +140,7 @@ models: -2. Run the [`dbt test` command](commands/test): +2. Run the [`dbt test` command](/reference/commands/test): ``` $ dbt test @@ -241,13 +241,7 @@ where {{ column_name }} is null ## Storing test failures - - -* `v0.20.0`: Introduced storing test failures in the database - - - -Normally, a test query will calculate failures as part of its execution. 
If you set the optional `--store-failures` flag or [`store_failures` config](resource-configs/store_failures), dbt will first save the results of a test query to a table in the database, and then query that table to calculate the number of failures. +Normally, a test query will calculate failures as part of its execution. If you set the optional `--store-failures` flag, the [`store_failures`](/reference/resource-configs/store_failures), or the [`store_failures_as`](/reference/resource-configs/store_failures_as) configs, dbt will first save the results of a test query to a table in the database, and then query that table to calculate the number of failures. This workflow allows you to query and examine failing records much more quickly in development: @@ -259,11 +253,11 @@ Note that, if you elect to store test failures: ## FAQs - - - - - - - - + + + + + + + + diff --git a/website/docs/docs/build/validation.md b/website/docs/docs/build/validation.md new file mode 100644 index 00000000000..02ce48729a4 --- /dev/null +++ b/website/docs/docs/build/validation.md @@ -0,0 +1,53 @@ +--- +title: Validations +id: validation +description: "The Semantic Layer, powered by MetricFlow, has three types of built-in validations, including Parsing Validation, Semantic Validation, and Data Warehouse validation, which are performed in a sequential and blocking manner." +sidebar_label: "Validations" +tags: [Metrics, Semantic Layer] +--- + +Validations refer to the process of checking whether a system or configuration meets the expected requirements or constraints. In the case of the Semantic Layer, powered by MetricFlow, there are three built-in validations — [parsing](#parsing), [semantic](#semantic), and [data platform](#data-platform). + +These validations ensure that configuration files follow the expected schema, the semantic graph doesn't violate any constraints, and semantic definitions in the graph exist in the physical table - providing effective data governance support. 
These three validation steps occur sequentially and must succeed before proceeding to the next step. + +The code that handles validation [can be found here](https://github.com/dbt-labs/dbt-semantic-interfaces/tree/main/dbt_semantic_interfaces/validations) for those who want to dive deeper into this topic. + + +## Validations command + +You can run validations from dbt Cloud or the command line with the following [MetricFlow commands](/docs/build/metricflow-commands): + +```bash +dbt sl validate-configs # dbt Cloud users +mf validate-configs # dbt Core users +``` + +## Parsing + +In this validation step, we ensure your config files follow the defined schema for each semantic graph object and can be parsed successfully. It validates the schema for the following core objects: + +* Semantic models +* Identifiers +* Measures +* Dimensions +* Metrics + +## Semantic + +This validation step occurs after we've built your semantic graph. The Semantic Layer, powered by MetricFlow, runs a suite of tests to ensure that your semantic graph doesn't violate any constraints. For example, we check to see if measure names are unique, or if metrics referenced in materialization exist. The current semantic rules we check for are: + +1. Check that semantic models with measures have a valid time dimension +2. Check that there is only one primary identifier defined in each semantic model +3. Dimension consistency +4. Unique measures in semantic models +5. Measures in metrics are valid +6. Cumulative metrics are configured properly + +## Data platform + +This type of validation checks to see if the semantic definitions in your semantic graph exist in the underlying physical table. To test this, we run queries against your data platform to ensure the generated SQL for semantic models, dimensions, and metrics will execute.
We run the following checks + +* Check that measures and dimensions exist +* Check that underlying tables for data sources exist +* Check that the generated SQL for metrics will execute + diff --git a/website/docs/docs/building-a-dbt-project/building-models/python-models.md b/website/docs/docs/building-a-dbt-project/building-models/python-models.md deleted file mode 100644 index 4c25da2a10d..00000000000 --- a/website/docs/docs/building-a-dbt-project/building-models/python-models.md +++ /dev/null @@ -1,713 +0,0 @@ ---- -title: "Python models" ---- - -:::info Brand new! - -dbt Core v1.3 included first-ever support for Python models. Note that only [specific data platforms](#specific-data-platforms) support dbt-py models. - -We encourage you to: -- Read [the original discussion](https://github.com/dbt-labs/dbt-core/discussions/5261) that proposed this feature. -- Contribute to [best practices for developing Python models in dbt](https://discourse.getdbt.com/t/dbt-python-model-dbt-py-best-practices/5204 ). -- Weigh in on [next steps for Python models, beyond v1.3](https://github.com/dbt-labs/dbt-core/discussions/5742). -- Join the **#dbt-core-python-models** channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/). - -Below, you'll see sections entitled "❓ **Our questions**." We are excited to have released a first narrow set of functionality in v1.3, which will solve real use cases. We also know this is a first step into a much wider field of possibility. We don't pretend to have all the answers. We're excited to keep developing our opinionated recommendations and next steps for product development—and we want your help. Comment in the GitHub discussions; leave thoughts in Slack; bring up dbt + Python in casual conversation with colleagues and friends. -::: - -## About Python models in dbt - -dbt Python ("dbt-py") models will help you solve use cases that can't be solved with SQL. 
You can perform analyses using tools available in the open source Python ecosystem, including state-of-the-art packages for data science and statistics. Before, you would have needed separate infrastructure and orchestration to run Python transformations in production. By defining your Python transformations in dbt, they're just models in your project, with all the same capabilities around testing, documentation, and lineage. - - - -Python models are supported in dbt Core 1.3 and above. Learn more about [upgrading your version in dbt Cloud](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions) and [upgrading dbt Core versions](https://docs.getdbt.com/docs/core-versions#upgrading-to-new-patch-versions). - -To read more about Python models, change the docs version to 1.3 or higher in the menu above. - - - - - - - - -```python -import ... - -def model(dbt, session): - - my_sql_model_df = dbt.ref("my_sql_model") - - final_df = ... # stuff you can't write in SQL! - - return final_df -``` - - - - - -```yml -version: 2 - -models: - - name: my_python_model - - # Document within the same codebase - description: My transformation written in Python - - # Configure in ways that feel intuitive and familiar - config: - materialized: table - tags: ['python'] - - # Test the results of my Python transformation - columns: - - name: id - # Standard validation for 'grain' of Python results - tests: - - unique - - not_null - tests: - # Write your own validation logic (in SQL) for Python results - - [custom_generic_test](writing-custom-generic-tests) -``` - - - - - - -The prerequisites for dbt Python models include using an adapter for a data platform that supports a fully featured Python runtime. In a dbt Python model, all Python code is executed remotely on the platform. None of it is run by dbt locally. We believe in clearly separating _model definition_ from _model execution_. 
In this and many other ways, you'll find that dbt's approach to Python models mirrors its longstanding approach to modeling data in SQL. - -We've written this guide assuming that you have some familiarity with dbt. If you've never before written a dbt model, we encourage you to start by first reading [dbt Models](/docs/build/models). Throughout, we'll be drawing connections between Python models and SQL models, as well as making clear their differences. - -### What is a Python model? - -A dbt Python model is a function that reads in dbt sources or other models, applies a series of transformations, and returns a transformed dataset. DataFrame operations define the starting points, the end state, and each step along the way. - -This is similar to the role of CTEs in dbt SQL models. We use CTEs to pull in upstream datasets, define (and name) a series of meaningful transformations, and end with a final `select` statement. You can run the compiled version of a dbt SQL model to see the data included in the resulting view or table. When you `dbt run`, dbt wraps that query in `create view`, `create table`, or more complex DDL to save its results in the database. - -Instead of a final `select` statement, each Python model returns a final DataFrame. Each DataFrame operation is "lazily evaluated." In development, you can preview its data, using methods like `.show()` or `.head()`. When you run a Python model, the full result of the final DataFrame will be saved as a table in your data warehouse. - -dbt Python models have access to almost all of the same configuration options as SQL models. You can test them, document them, add `tags` and `meta` properties to them, grant access to their results to other users, and so on. You can select them by their name, their file path, their configurations, whether they are upstream or downstream of another model, or whether they have been modified compared to a previous project state. 
- -### Defining a Python model - -Each Python model lives in a `.py` file in your `models/` folder. It defines a function named **`model()`**, which takes two parameters: -- **`dbt`**: A class compiled by dbt Core, unique to each model, enables you to run your Python code in the context of your dbt project and DAG. -- **`session`**: A class representing your data platform’s connection to the Python backend. The session is needed to read in tables as DataFrames, and to write DataFrames back to tables. In PySpark, by convention, the `SparkSession` is named `spark`, and available globally. For consistency across platforms, we always pass it into the `model` function as an explicit argument called `session`. - -The `model()` function must return a single DataFrame. On Snowpark (Snowflake), this can be a Snowpark or pandas DataFrame. Via PySpark (Databricks + BigQuery), this can be a Spark, pandas, or pandas-on-Spark DataFrame. For more about choosing between pandas and native DataFrames, see [DataFrame API + syntax](#dataframe-api--syntax). - -When you `dbt run --select python_model`, dbt will prepare and pass in both arguments (`dbt` and `session`). All you have to do is define the function. This is how every single Python model should look: - - - -```python -def model(dbt, session): - - ... - - return final_df -``` - - - - -### Referencing other models - -Python models participate fully in dbt's directed acyclic graph (DAG) of transformations. Use the `dbt.ref()` method within a Python model to read in data from other models (SQL or Python). If you want to read directly from a raw source table, use `dbt.source()`. These methods return DataFrames pointing to the upstream source, model, seed, or snapshot. - - - -```python -def model(dbt, session): - - # DataFrame representing an upstream model - upstream_model = dbt.ref("upstream_model_name") - - # DataFrame representing an upstream source - upstream_source = dbt.source("upstream_source_name", "table_name") - - ... 
-``` - - - -Of course, you can `ref()` your Python model in downstream SQL models, too: - - - -```sql -with upstream_python_model as ( - - select * from {{ ref('my_python_model') }} - -), - -... -``` - - - -### Configuring Python models - -Just like SQL models, there are three ways to configure Python models: -1. In `dbt_project.yml`, where you can configure many models at once -2. In a dedicated `.yml` file, within the `models/` directory -3. Within the model's `.py` file, using the `dbt.config()` method - -Calling the `dbt.config()` method will set configurations for your model right within your `.py` file, similar to the `{{ config() }}` macro in `.sql` model files: - - - -```python -def model(dbt, session): - - # setting configuration - dbt.config(materialized="table") -``` - - - -There's a limit to how fancy you can get with the `dbt.config()` method. It accepts _only_ literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the [`config` property](resource-properties/config) in a yaml file. - -#### Accessing project context - -dbt Python models don't use Jinja to render compiled code. Python models have limited access to global project contexts compared to SQL models. That context is made available from the `dbt` class, passed in as an argument to the `model()` function. 
- -Out of the box, the `dbt` class supports: -- Returning DataFrames referencing the locations of other resources: `dbt.ref()` + `dbt.source()` -- Accessing the database location of the current model: `dbt.this()` (also: `dbt.this.database`, `.schema`, `.identifier`) -- Determining if the current model's run is incremental: `dbt.is_incremental` - -It is possible to extend this context by "getting" them via `dbt.config.get()` after they are configured in the [model's config](/reference/model-configs). This includes inputs such as `var`, `env_var`, and `target`. If you want to use those values to power conditional logic in your model, we require setting them through a dedicated `.yml` file config: - - - -```yml -version: 2 - -models: - - name: my_python_model - config: - materialized: table - target_name: "{{ target.name }}" - specific_var: "{{ var('SPECIFIC_VAR') }}" - specific_env_var: "{{ env_var('SPECIFIC_ENV_VAR') }}" -``` - - - -Then, within the model's Python code, use the `dbt.config.get()` function to _access_ values of configurations that have been set: - - - -```python -def model(dbt, session): - target_name = dbt.config.get("target_name") - specific_var = dbt.config.get("specific_var") - specific_env_var = dbt.config.get("specific_env_var") - - orders_df = dbt.ref("fct_orders") - - # limit data in dev - if target_name == "dev": - orders_df = orders_df.limit(500) -``` - - - -### Materializations - -Python models support two materializations: -- `table` -- `incremental` - -Incremental Python models support all the same [incremental strategies](/docs/build/incremental-models#about-incremental_strategy) as their SQL counterparts. The specific strategies supported depend on your adapter. - -Python models can't be materialized as `view` or `ephemeral`. Python isn't supported for non-model resource types (like tests and snapshots). - -For incremental models, like SQL models, you will need to filter incoming tables to only new rows of data: - - - -
- - - -```python -import snowflake.snowpark.functions as F - -def model(dbt, session): - dbt.config(materialized = "incremental") - df = dbt.ref("upstream_table") - - if dbt.is_incremental: - - # only new rows compared to max in current table - max_from_this = f"select max(updated_at) from {dbt.this}" - df = df.filter(df.updated_at >= session.sql(max_from_this).collect()[0][0]) - - # or only rows from the past 3 days - df = df.filter(df.updated_at >= F.dateadd("day", F.lit(-3), F.current_timestamp())) - - ... - - return df -``` - - - -
- -
- - - -```python -import pyspark.sql.functions as F - -def model(dbt, session): - dbt.config(materialized = "incremental") - df = dbt.ref("upstream_table") - - if dbt.is_incremental: - - # only new rows compared to max in current table - max_from_this = f"select max(updated_at) from {dbt.this}" - df = df.filter(df.updated_at >= session.sql(max_from_this).collect()[0][0]) - - # or only rows from the past 3 days - df = df.filter(df.updated_at >= F.date_add(F.current_timestamp(), F.lit(-3))) - - ... - - return df -``` - - - -
- -
- -**Note:** Incremental models are supported on BigQuery/Dataproc for the `merge` incremental strategy. The `insert_overwrite` strategy is not yet supported. - -## Python-specific functionality - -### Defining functions - -In addition to defining a `model` function, the Python model can import other functions or define its own. Here's an example, on Snowpark, defining a custom `add_one` function: - - - -```python -def add_one(x): - return x + 1 - -def model(dbt, session): - dbt.config(materialized="table") - temps_df = dbt.ref("temperatures") - - # warm things up just a little - df = temps_df.withColumn("degree_plus_one", add_one(temps_df["degree"])) - return df -``` - - - -At present, Python functions defined in one dbt model can't be imported and reused in other models. See the ["Code reuse"](#code-reuse) section for the potential patterns we're considering. - -### Using PyPI packages - -You can also define functions that depend on third-party packages, so long as those packages are installed and available to the Python runtime on your data platform. See notes on "Installing Packages" for [specific data warehouses](#specific-data-warehouses). - -In this example, we use the `holidays` package to determine if a given date is a holiday in France. For simplicity and consistency across platforms, the code below uses the pandas API. The exact syntax, and the need to refactor for multi-node processing, still varies. - - - -
- - - -```python -import holidays - -def is_holiday(date_col): - # Chez Jaffle - french_holidays = holidays.France() - is_holiday = (date_col in french_holidays) - return is_holiday - -def model(dbt, session): - dbt.config( - materialized = "table", - packages = ["holidays"] - ) - - orders_df = dbt.ref("stg_orders") - - df = orders_df.to_pandas() - - # apply our function - # (columns need to be in uppercase on Snowpark) - df["IS_HOLIDAY"] = df["ORDER_DATE"].apply(is_holiday) - - # return final dataset (Pandas DataFrame) - return df -``` - - - -
- -
- - - -```python -import holidays - -def is_holiday(date_col): - # Chez Jaffle - french_holidays = holidays.France() - is_holiday = (date_col in french_holidays) - return is_holiday - -def model(dbt, session): - dbt.config( - materialized = "table", - packages = ["holidays"] - ) - - orders_df = dbt.ref("stg_orders") - - df = orders_df.to_pandas_on_spark() # Spark 3.2+ - # df = orders_df.toPandas() in earlier versions - - # apply our function - df["is_holiday"] = df["order_date"].apply(is_holiday) - - # convert back to PySpark - df = df.to_spark() # Spark 3.2+ - # df = session.createDataFrame(df) in earlier versions - - # return final dataset (PySpark DataFrame) - return df -``` - - - -
- -
- -#### Configuring packages - -We encourage you to explicitly configure required packages and versions so dbt can track them in project metadata. This configuration is required for the implementation on some platforms. If you need specific versions of packages, specify them. - - - -```python -def model(dbt, session): - dbt.config( - packages = ["numpy==1.23.1", "scikit-learn"] - ) -``` - - - - - -```yml -version: 2 - -models: - - name: my_python_model - config: - packages: - - "numpy==1.23.1" - - scikit-learn -``` - - - -#### UDFs - -You can use the `@udf` decorator or `udf` function to define an "anonymous" function and call it within your `model` function's DataFrame transformation. This is a typical pattern for applying more complex functions as DataFrame operations, especially if those functions require inputs from third-party packages. -- [Snowpark Python: Creating UDFs](https://docs.snowflake.com/en/developer-guide/snowpark/python/creating-udfs.html) -- [PySpark functions: udf](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.udf.html) - - - -
- - - -```python -import snowflake.snowpark.types as T -import snowflake.snowpark.functions as F -import numpy - -def register_udf_add_random(): - add_random = F.udf( - # use 'lambda' syntax, for simple functional behavior - lambda x: x + numpy.random.normal(), - return_type=T.FloatType(), - input_types=[T.FloatType()] - ) - return add_random - -def model(dbt, session): - - dbt.config( - materialized = "table", - packages = ["numpy"] - ) - - temps_df = dbt.ref("temperatures") - - add_random = register_udf_add_random() - - # warm things up, who knows by how much - df = temps_df.withColumn("degree_plus_random", add_random("degree")) - return df -``` - - - -**Note:** Due to a Snowpark limitation, it is not currently possible to register complex named UDFs within stored procedures, and therefore dbt Python models. We are looking to add native support for Python UDFs as a project/DAG resource type in a future release. For the time being, if you want to create a "vectorized" Python UDF via the Batch API, we recommend either: -- Writing [`create function`](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html) inside a SQL macro, to run as a hook or run-operation -- [Registering from a staged file](https://docs.snowflake.com/ko/developer-guide/snowpark/reference/python/_autosummary/snowflake.snowpark.udf.html#snowflake.snowpark.udf.UDFRegistration.register_from_file) within your Python model code - -
- -
- - - -```python -from pyspark.sql.types as T -import pyspark.sql.functions as F -import numpy - -# use a 'decorator' for more readable code -@F.udf(returnType=T.DoubleType()) -def add_random(x): - random_number = numpy.random.normal() - return x + random_number - -def model(dbt, session): - dbt.config( - materialized = "table", - packages = ["numpy"] - ) - - temps_df = dbt.ref("temperatures") - - # warm things up, who knows by how much - df = temps_df.withColumn("degree_plus_random", add_random("degree")) - return df -``` - - - -
- -
- -#### Code reuse - -Currently, you cannot import or reuse Python functions defined in one dbt model, in other models. This is something we'd like dbt to support. There are two patterns we're considering: -1. Creating and registering **"named" UDFs**. This process is different across data platforms and has some performance limitations. (Snowpark does support ["vectorized" UDFs](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-batch.html): pandas-like functions that you can execute in parallel.) -2. Using **private Python packages**. In addition to importing reusable functions from public PyPI packages, many data platforms support uploading custom Python assets and registering them as packages. The upload process looks different across platforms, but your code’s actual `import` looks the same. - -:::note ❓ Our questions - -- Should dbt have a role in abstracting over UDFs? Should dbt support a new type of DAG node, `function`? Would the primary use case be code reuse across Python models or defining Python-language functions that can be called from SQL models? -- How can dbt help users when uploading or initializing private Python assets? Is this a new form of `dbt deps`? -- How can dbt support users who want to test custom functions? If defined as UDFs: "unit testing" in the database? If "pure" functions in packages: encourage adoption of `pytest`? - -💬 Discussion: ["Python models: package, artifact/object storage, and UDF management in dbt"](https://github.com/dbt-labs/dbt-core/discussions/5741) -::: - -### DataFrame API and syntax - -Over the past decade, most people writing data transformations in Python have adopted DataFrame as their common abstraction. dbt follows this convention by returning `ref()` and `source()` as DataFrames, and it expects all Python models to return a DataFrame. - -A DataFrame is a two-dimensional data structure (rows and columns). 
It supports convenient methods for transforming that data, creating new columns from calculations performed on existing columns. It also offers convenient ways for previewing data while developing locally or in a notebook. - -That's about where the agreement ends. There are numerous frameworks with their own syntaxes and APIs for DataFrames. The [pandas](https://pandas.pydata.org/docs/) library offered one of the original DataFrame APIs, and its syntax is the most common to learn for new data professionals. Most newer DataFrame APIs are compatible with pandas-style syntax, though few can offer perfect interoperability. This is true for Snowpark and PySpark, which have their own DataFrame APIs. - -When developing a Python model, you will find yourself asking these questions: - -**Why pandas?** It's the most common API for DataFrames. It makes it easy to explore sampled data and develop transformations locally. You can “promote” your code as-is into dbt models and run it in production for small datasets. - -**Why _not_ pandas?** Performance. pandas runs "single-node" transformations, which cannot benefit from the parallelism and distributed computing offered by modern data warehouses. This quickly becomes a problem as you operate on larger datasets. Some data platforms support optimizations for code written using pandas' DataFrame API, preventing the need for major refactors. For example, ["pandas on PySpark"](https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html) offers support for 95% of pandas functionality, using the same API while still leveraging parallel processing. - -:::note ❓ Our questions -- When developing a new dbt Python model, should we recommend pandas-style syntax for rapid iteration and then refactor? -- Which open source libraries provide compelling abstractions across different data engines and vendor-specific APIs? -- Should dbt attempt to play a longer-term role in standardizing across them? 
- -💬 Discussion: ["Python models: the pandas problem (and a possible solution)"](https://github.com/dbt-labs/dbt-core/discussions/5738) -::: - -### Limitations - -Python models have capabilities that SQL models do not. They also have some drawbacks compared to SQL models: - -- **Time and cost.** Python models are slower to run than SQL models, and the cloud resources that run them can be more expensive. Running Python requires more general-purpose compute. That compute might sometimes live on a separate service or architecture from your SQL models. **However:** We believe that deploying Python models via dbt—with unified lineage, testing, and documentation—is, from a human standpoint, **dramatically** faster and cheaper. By comparison, spinning up separate infrastructure to orchestrate Python transformations in production and different tooling to integrate with dbt is much more time-consuming and expensive. -- **Syntax differences** are even more pronounced. Over the years, dbt has done a lot, via dispatch patterns and packages such as `dbt_utils`, to abstract over differences in SQL dialects across popular data warehouses. Python offers a **much** wider field of play. If there are five ways to do something in SQL, there are 500 ways to write it in Python, all with varying performance and adherence to standards. Those options can be overwhelming. As the maintainers of dbt, we will be learning from state-of-the-art projects tackling this problem and sharing guidance as we develop it. -- **These capabilities are very new.** As data warehouses develop new features, we expect them to offer cheaper, faster, and more intuitive mechanisms for deploying Python transformations. **We reserve the right to change the underlying implementation for executing Python models in future releases.** Our commitment to you is around the code in your model `.py` files, following the documented capabilities and guidance we're providing here. 
- -As a general rule, if there's a transformation you could write equally well in SQL or Python, we believe that well-written SQL is preferable: it's more accessible to a greater number of colleagues, and it's easier to write code that's performant at scale. If there's a transformation you _can't_ write in SQL, or where ten lines of elegant and well-annotated Python could save you 1000 lines of hard-to-read Jinja-SQL, Python is the way to go. - -## Specific data platforms - -In their initial launch, Python models are supported on three of the most popular data platforms: Snowflake, Databricks, and BigQuery/GCP (via Dataproc). Both Databricks and GCP's Dataproc use PySpark as the processing framework. Snowflake uses its own framework, Snowpark, which has many similarities to PySpark. - - - -
- -**Additional setup:** You will need to [acknowledge and accept Snowflake Third Party Terms](https://docs.snowflake.com/en/developer-guide/udf/python/udf-python-packages.html#getting-started) to use Anaconda packages. - -**Installing packages:** Snowpark supports several popular packages via Anaconda. The complete list is at https://repo.anaconda.com/pkgs/snowflake/. Packages are installed at the time your model is being run. Different models can have different package dependencies. If you are using third-party packages, Snowflake recommends using a dedicated virtual warehouse for best performance rather than one with many concurrent users. - -**About "sprocs":** dbt submits Python models to run as "stored procedures," which some people call "sprocs" for short. By default, dbt will create a named sproc containing your model's compiled Python code, and then "call" it to execute. Snowpark has a Private Preview feature for "temporary" or "anonymous" stored procedures ([docs](https://docs.snowflake.com/en/LIMITEDACCESS/call-with.html)), which are faster and leave a cleaner query history. If this feature is enabled for your account, you can switch it on for your models by configuring `use_anonymous_sproc: True`. We plan to switch this on for all dbt + Snowpark Python models in a future release. - - - -```yml -# I asked Snowflake Support to enable this Private Preview feature, -# and now my dbt-py models run even faster! -models: - use_anonymous_sproc: True -``` - - - -**Docs:** ["Developer Guide: Snowpark Python"](https://docs.snowflake.com/en/developer-guide/snowpark/python/index.html) - -
- -
- -**Submission methods:** Databricks supports a few different mechanisms to submit PySpark code, each with relative advantages. Some are better for supporting iterative development, while others are better for supporting lower-cost production deployments. The options are: -- `all_purpose_cluster` (default): dbt will run your Python model using the cluster ID configured as `cluster` in your connection profile or for this specific model. These clusters are more expensive but also much more responsive. We recommend using an interactive all-purpose cluster for quicker iteration in development. - - `create_notebook: True`: dbt will upload your model's compiled PySpark code to a notebook in the namespace `/Shared/dbt_python_model/{schema}`, where `{schema}` is the configured schema for the model, and execute that notebook to run using the all-purpose cluster. The appeal of this approach is that you can easily open the notebook in the Databricks UI for debugging or fine-tuning right after running your model. Remember to copy any changes into your dbt `.py` model code before re-running. - - `create_notebook: False` (default): dbt will use the [Command API](https://docs.databricks.com/dev-tools/api/1.2/index.html#run-a-command), which is slightly faster. -- `job_cluster`: dbt will upload your model's compiled PySpark code to a notebook in the namespace `/Shared/dbt_python_model/{schema}`, where `{schema}` is the configured schema for the model, and execute that notebook to run using a short-lived jobs cluster. For each Python model, Databricks will need to spin up the cluster, execute the model's PySpark transformation, and then spin down the cluster. As such, job clusters take longer before and after model execution, but they're also less expensive, so we recommend these for longer-running Python models in production. 
To use the `job_cluster` submission method, your model must be configured with `job_cluster_config`, which defines key-value properties for `new_cluster`, as defined in the [JobRunsSubmit API](https://docs.databricks.com/dev-tools/api/latest/jobs.html#operation/JobsRunsSubmit). - -You can configure each model's `submission_method` in all the standard ways you supply configuration: - -```python -def model(dbt, session): - dbt.config( - submission_method="all_purpose_cluster", - create_notebook=True, - cluster_id="abcd-1234-wxyz" - ) - ... -``` -```yml -version: 2 -models: - - name: my_python_model - config: - submission_method: job_cluster - job_cluster_config: - spark_version: ... - node_type_id: ... -``` -```yml -# dbt_project.yml -models: - project_name: - subfolder: - # set defaults for all .py models defined in this subfolder - +submission_method: all_purpose_cluster - +create_notebook: False - +cluster_id: abcd-1234-wxyz -``` - -If not configured, `dbt-spark` will use the built-in defaults: the all-purpose cluster (based on `cluster` in your connection profile) without creating a notebook. The `dbt-databricks` adapter will default to the cluster configured in `http_path`. We encourage explicitly configuring the clusters for Python models in Databricks projects. - -**Installing packages:** When using all-purpose clusters, we recommend installing packages which you will be using to run your Python models. - -**Docs:** -- [PySpark DataFrame syntax](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) -- [Databricks: Introduction to DataFrames - Python](https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-python.html) - -
- -
- -The `dbt-bigquery` adapter uses a service called Dataproc to submit your Python models as PySpark jobs. That Python/PySpark code will read from your tables and views in BigQuery, perform all computation in Dataproc, and write the final result back to BigQuery. - -**Submission methods.** Dataproc supports two submission methods: `serverless` and `cluster`. Dataproc Serverless does not require a ready cluster, which saves on hassle and cost—but it is slower to start up, and much more limited in terms of available configuration. For example, Dataproc Serverless supports only a small set of Python packages, though it does include `pandas`, `numpy`, and `scikit-learn`. (See the full list [here](https://cloud.google.com/dataproc-serverless/docs/guides/custom-containers#example_custom_container_image_build), under "The following packages are installed in the default image"). Whereas, by creating a Dataproc Cluster in advance, you can fine-tune the cluster's configuration, install any PyPI packages you want, and benefit from faster, more responsive runtimes. - -Use the `cluster` submission method with dedicated Dataproc clusters you or your organization manage. Use the `serverless` submission method to avoid managing a Spark cluster. The latter may be quicker for getting started, but both are valid for production. - -**Additional setup:** -- Create or use an existing [Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) -- Enable Dataproc APIs for your project + region -- If using the `cluster` submission method: Create or use an existing [Dataproc cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster) with the [Spark BigQuery connector initialization action](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/connectors#bigquery-connectors). (Google recommends copying the action into your own Cloud Storage bucket, rather than using the example version shown in the screenshot below.) 
- - - -The following configurations are needed to run Python models on Dataproc. You can add these to your [BigQuery profile](/reference/warehouse-setups/bigquery-setup#running-python-models-on-dataproc), or configure them on specific Python models: -- `gcs_bucket`: Storage bucket to which dbt will upload your model's compiled PySpark code. -- `dataproc_region`: GCP region in which you have enabled Dataproc (for example `us-central1`) -- `dataproc_cluster_name`: Name of Dataproc cluster to use for running Python model (executing PySpark job). Only required if `submission_method: cluster`. - -```python -def model(dbt, session): - dbt.config( - submission_method="cluster", - dataproc_cluster_name="my-favorite-cluster" - ) - ... -``` -```yml -version: 2 -models: - - name: my_python_model - config: - submission_method: serverless -``` - -Any user or service account that runs dbt Python models will need the following permissions, in addition to permissions needed for BigQuery ([docs](https://cloud.google.com/dataproc/docs/concepts/iam/iam)): -``` -dataproc.clusters.use -dataproc.jobs.create -dataproc.jobs.get -dataproc.operations.get -storage.buckets.get -storage.objects.create -storage.objects.delete -``` - -**Installing packages:** If you are using a Dataproc Cluster (as opposed to Dataproc Serverless), you can add third-party packages while creating the cluster. 
- -Google recommends installing Python packages on Dataproc clusters via initialization actions: -- [How initialization actions are used](https://github.com/GoogleCloudDataproc/initialization-actions/blob/master/README.md#how-initialization-actions-are-used) -- [Actions for installing via `pip` or `conda`](https://github.com/GoogleCloudDataproc/initialization-actions/tree/master/python) - -You can also install packages at cluster creation time by [defining cluster properties](https://cloud.google.com/dataproc/docs/tutorials/python-configuration#image_version_20): `dataproc:pip.packages` or `dataproc:conda.packages`. - - - -**Docs:** -- [Dataproc overview](https://cloud.google.com/dataproc/docs/concepts/overview) -- [PySpark DataFrame syntax](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) - -
- -
- -
diff --git a/website/docs/docs/building-a-dbt-project/dont-nest-your-curlies.md b/website/docs/docs/building-a-dbt-project/dont-nest-your-curlies.md index 330a91a8a54..5d410be0f68 100644 --- a/website/docs/docs/building-a-dbt-project/dont-nest-your-curlies.md +++ b/website/docs/docs/building-a-dbt-project/dont-nest-your-curlies.md @@ -80,4 +80,4 @@ Code like this is both valid, and encouraged: {{ config(post_hook="grant select on {{ this }} to role bi_role") }} ``` -So why are curlies inside of curlies allowed in this case? Here, we actually _want_ the string literal `"grant select on {{ this }} ..."` to be saved as the configuration value for the post-hook in this model. This string will be re-rendered when the model runs, resulting in a sensible sql expressions like `grant select on "schema"."table"....` being executed against the database. These hooks are a special exception to the rule stated above. +So why are curlies inside of curlies allowed in this case? Here, we actually _want_ the string literal `"grant select on {{ this }} ..."` to be saved as the configuration value for the post-hook in this model. This string will be re-rendered when the model runs, resulting in a sensible SQL expression like `grant select on "schema"."table"....` being executed against the database. These hooks are a special exception to the rule stated above. diff --git a/website/docs/docs/cloud/about-cloud-develop-defer.md b/website/docs/docs/cloud/about-cloud-develop-defer.md new file mode 100644 index 00000000000..1c0b316f885 --- /dev/null +++ b/website/docs/docs/cloud/about-cloud-develop-defer.md @@ -0,0 +1,55 @@ +--- +title: Using defer in dbt Cloud +id: about-cloud-develop-defer +description: "Learn how to leverage defer to prod when developing with dbt Cloud." 
+sidebar_label: "Using defer in dbt Cloud" +pagination_next: "docs/cloud/cloud-cli-installation" +--- + + +[Defer](/reference/node-selection/defer) is a powerful feature that allows developers to build, run, and test only the models they've edited without having to first run and build all the models that come before them (upstream parents). This is powered by using a production manifest for comparison, and dbt will resolve the `{{ ref() }}` function with upstream production artifacts. + +Both the dbt Cloud IDE and the dbt Cloud CLI allow users to natively defer to production metadata directly in their development workflows, dramatically reducing development time and warehouse spend by preventing unnecessary model builds. + +## Required setup + +- You must select the **[Production environment](/docs/deploy/deploy-environments#set-as-production-environment-beta)** checkbox in the **Environment Settings** page. + - This can be set for one deployment environment per dbt Cloud project. +- You must have a successful job run first. + +When you use defer, dbt Cloud compares artifacts from the most recent successful production job, excluding CI jobs. + +### Defer in the dbt Cloud IDE + +To enable 'Defer' in the dbt Cloud IDE, toggle the **Defer to production** button on the command bar. Once enabled, dbt Cloud will: + +1. Pull down the most recent manifest from the Production environment for comparison +2. Pass the `--defer` flag to the command (for any command that accepts the flag) + +For example, if you were to start developing on a new branch with [nothing in your development schema](/reference/node-selection/defer#usage), edit a single model, and run `dbt build -s state:modified` — only the edited model would run. Any `{{ ref() }}` functions will point to the production location of the referenced models. 
+ + +### Defer in dbt Cloud CLI + +One key difference between using `--defer` in the dbt Cloud CLI and the dbt Cloud IDE is that `--defer` is *automatically* enabled in the dbt Cloud CLI for all invocations, comparing with production artifacts. You can disable it with the `--no-defer` flag. + +The dbt Cloud CLI offers additional flexibility by letting you choose the source environment for deferral artifacts. You can set a `defer-env-id` key in either your `dbt_project.yml` or `dbt_cloud.yml` file. If you do not provide a `defer-env-id` setting, the dbt Cloud CLI will use artifacts from your dbt Cloud environment marked 'Production'. + + + + ```yml +defer-env-id: '123456' +``` + + + + + + +```yml +dbt_cloud: + defer-env-id: '123456' +``` + + diff --git a/website/docs/docs/cloud/about-cloud-develop.md b/website/docs/docs/cloud/about-cloud-develop.md new file mode 100644 index 00000000000..9f864ede5ca --- /dev/null +++ b/website/docs/docs/cloud/about-cloud-develop.md @@ -0,0 +1,33 @@ +--- +title: About developing in dbt Cloud +id: about-cloud-develop +description: "Learn how to develop your dbt projects using dbt Cloud." +sidebar_label: "About developing in dbt Cloud" +pagination_next: "docs/cloud/cloud-cli-installation" +hide_table_of_contents: true +--- + +dbt Cloud offers a fast and reliable way to work on your dbt project. It runs dbt Core in a hosted (single or multi-tenant) environment. You can develop in your browser using an integrated development environment (IDE) or in a dbt Cloud-powered command line interface (CLI): + +
+ + + + + +

+ +The following sections provide detailed instructions on setting up the dbt Cloud CLI and dbt Cloud IDE. To get started with dbt development, you'll need a [developer](/docs/cloud/manage-access/seats-and-users) account. For a more comprehensive guide about developing in dbt, refer to our [quickstart guides](/quickstarts). + + +--------- +**Note**: The dbt Cloud CLI and the open-sourced dbt Core are both command line tools that let you run dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features). + diff --git a/website/docs/docs/cloud/about-cloud-setup.md b/website/docs/docs/cloud/about-cloud-setup.md new file mode 100644 index 00000000000..7b68b52a45a --- /dev/null +++ b/website/docs/docs/cloud/about-cloud-setup.md @@ -0,0 +1,28 @@ +--- +title: About dbt Cloud setup +id: about-cloud-setup +description: "Configuration settings for dbt Cloud." +sidebar_label: "About dbt Cloud setup" +pagination_next: "docs/dbt-cloud-environments" +pagination_prev: null +--- + +dbt Cloud is the fastest and most reliable way to deploy your dbt jobs. It contains a myriad of settings that can be configured by admins, from the necessities (data platform integration) to security enhancements (SSO) and quality-of-life features (RBAC). This portion of our documentation will take you through the various settings found by clicking on the gear icon in the dbt Cloud UI, including: + +- [Connecting to a data platform](/docs/cloud/connect-data-platform/about-connections) +- Configuring access to [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), or your own [git repo URL](/docs/cloud/git/import-a-project-by-git-url). 
+- [Managing users and licenses](/docs/cloud/manage-access/seats-and-users) +- [Configuring secure access](/docs/cloud/manage-access/about-user-access) +- Configuring the [dbt Cloud IDE](/docs/cloud/about-cloud-develop) +- Installing and configuring the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) + +These settings are intended for dbt Cloud administrators. If you need a more detailed first-time setup guide for specific data platforms, read our [quickstart guides](/quickstarts). + +If you want a more in-depth learning experience, we recommend taking the dbt Fundamentals on our [dbt Learn online courses site](https://courses.getdbt.com/). + +## Prerequisites +- To set up dbt Cloud, you'll need to have a dbt Cloud account with administrator access. If you still need to create a dbt Cloud account, [sign up today](https://getdbt.com) on our North American servers or [contact us](https://getdbt.com/contact) for international options. +- To have the best experience using dbt Cloud, we recommend you use modern and up-to-date web browsers like Chrome, Safari, Edge, and Firefox. + + + diff --git a/website/docs/docs/cloud/about-cloud/about-cloud-ide.md b/website/docs/docs/cloud/about-cloud/about-cloud-ide.md new file mode 100644 index 00000000000..7643928feec --- /dev/null +++ b/website/docs/docs/cloud/about-cloud/about-cloud-ide.md @@ -0,0 +1,31 @@ +--- +title: "About dbt Cloud IDE" +id: about-cloud-ide +description: "about dbt Cloud Integrated Development Environment" +sidebar_label: About dbt Cloud IDE +--- + +The dbt Cloud integrated development environment (IDE) is a single interface for building, testing, running, and version-controlling dbt projects from your browser. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly. 
+ + +With the Cloud IDE, you can: + +- Write modular SQL models with select statements and the ref() function, +- Compile dbt code into SQL and execute it against your database directly, +- Test every model before deploying them to production, +- Generate and view documentation of your dbt project, +- Leverage git and version-control your code from your browser with a couple of clicks, +- Create and test Python models: + * Compile Python models to see the full function that gets executed in your data platform + * See Python models in DAG in dbt version 1.3 and higher + * Currently, you can't preview python models +- Visualize a directed acyclic graph (DAG), and more. + + + +For more information, read the complete [Cloud IDE guide](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). + +## Related docs + +- [IDE user interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) +- [Tips and tricks](/docs/cloud/dbt-cloud-ide/dbt-cloud-tips) diff --git a/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md new file mode 100644 index 00000000000..71f3175a108 --- /dev/null +++ b/website/docs/docs/cloud/about-cloud/about-dbt-cloud.md @@ -0,0 +1,104 @@ +--- +title: "dbt Cloud features" +id: "dbt-cloud-features" +sidebar_label: "dbt Cloud features" +description: "Explore dbt Cloud's features and learn why dbt Cloud is the fastest way to deploy dbt" +hide_table_of_contents: true +pagination_next: "docs/cloud/about-cloud/architecture" +pagination_prev: null +--- + +dbt Cloud is the fastest and most reliable way to deploy dbt. Develop, test, schedule, document, and investigate data models all in one browser-based UI. 
+ +In addition to providing a hosted architecture for running dbt across your organization, dbt Cloud comes equipped with turnkey support for scheduling jobs, CI/CD, hosting documentation, monitoring and alerting, an integrated development environment (IDE), and allows you to develop and run dbt commands from your local command line interface (CLI) or code editor. + +dbt Cloud's [flexible plans](https://www.getdbt.com/pricing/) and features make it well-suited for data teams of any size — sign up for your [free 14-day trial](https://www.getdbt.com/signup/)! + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +

+ +*These features are available on [selected plans](https://www.getdbt.com/pricing/). +## Related docs + +- [dbt Cloud plans and pricing](https://www.getdbt.com/pricing/) +- [Quickstart guides](/quickstarts) +- [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) + diff --git a/website/docs/docs/cloud/about-cloud/architecture.md b/website/docs/docs/cloud/about-cloud/architecture.md new file mode 100644 index 00000000000..52614f0cbcd --- /dev/null +++ b/website/docs/docs/cloud/about-cloud/architecture.md @@ -0,0 +1,68 @@ +--- +title: "dbt Cloud Architecture" +id: "architecture" +description: "Information about the architecture, communication, and security of dbt Cloud" +--- + +This page helps practitioners and those interested in dbt Cloud's architecture and data flow. + +## About dbt Cloud architecture + +The dbt Cloud application has two types of components: static and dynamic. The static components are always running to serve highly available dbt Cloud functions, like the dbt Cloud web application. On the other hand, the dynamic components are created ad-hoc to handle tasks such as background jobs or requests to use the IDE. + +dbt Cloud is available in most regions around the world in both [single tenant](/docs/cloud/about-cloud/tenancy#single-tenant) (AWS and Azure) and [multi-tenant](/docs/cloud/about-cloud/tenancy#multi-tenant) configurations. + +dbt Cloud uses PostgreSQL for its backend, S3-compatible Object Storage systems for logs and artifacts, and a Kubernetes storage solution for creating dynamic, persistent volumes. + +All data at rest on dbt Cloud servers is protected using AES-256 encryption. + + + +For a more detailed breakdown of the dbt Cloud apps, [download the advanced architecture guide PDF](https://drive.google.com/uc?export=download&id=1lktNuMZybXfqFtr24J8zAssEfoL9r51S). 
+ +## Communication + +dbt Cloud can communicate with several external services, including data platforms, git repositories, authentication services, and directories. All communications occur over HTTPS (attempts to connect via HTTP are redirected to HTTPS). dbt Cloud encrypts in transit using the TLS 1.2 cryptographic protocol. + +TLS (Transport Layer Security) 1.2 is an industry-standard protocol for encrypting sensitive data while it travels over the public internet (which does not offer native encryption). + +A typical scenario that might be seen frequently is an employee working in a public space, such as an airport or café. The user might be connected to an unsecured public network offered by a facility to which many others are also connected. What if there is a bad actor amongst them running a program that can "capture" network packets and analyze them over the air? + +When that user is accessing dbt Cloud and running models that interact with the data platform, the information sent to and from their computer and the services is encrypted with TLS 1.2. + +If that user runs a command that initializes communication between dbt Cloud and the data warehouse (or a git repo or an auth service) over the internet, that communication is also encrypted. This means that while the bad actor can technically see the traffic moving over that unsecured network, they can't read or otherwise parse any information. They will not be able to eavesdrop on or hack the information in any way whatsoever. They would see a nonsensical set of characters that nobody can decrypt. + +For more detailed information on our security practices, read our [Security page](https://getdbt.com/security). + +### Data warehouse interaction + +dbt Cloud's primary role is as a data processor, not a data store. The dbt Cloud application enables users to dispatch SQL to the warehouse for transformation. However, users can post SQL that returns customer data into the dbt Cloud application. 
This data never persists and will only exist in memory on the instance for the duration of the session. To lock down customer data correctly, proper permissions must be applied to prevent improper access or storage of sensitive data. + +Some data warehouse providers offer advanced security features that can be leveraged in dbt Cloud. [PrivateLink](/docs/cloud/secure/about-privatelink) allows supported data platforms on AWS to communicate with dbt Cloud without the traffic traversing the public internet. [Snowflake](/docs/cloud/manage-access/set-up-snowflake-oauth) and [BigQuery](/docs/cloud/manage-access/set-up-bigquery-oauth) offer OAuth integration, which adds a layer of security for the data platforms (Enterprise plan only). + +### Git sync + +dbt Cloud can sync with a variety of git providers, including [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), and [Azure DevOps](/docs/cloud/git/connect-azure-devops) within its integrated development environment ([IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud)). Communication takes place over HTTPS rather than SSH and is protected using the TLS 1.2 protocol for data in transit. + +The git repo information is stored on dbt Cloud servers to make it accessible during the IDE sessions. When the git sync is disabled, you must [contact support](mailto:support@getdbt.com) to request the deletion of the synced data. + +### Authentication services + +The default settings of dbt Cloud enable local users with credentials stored in dbt Cloud. Still, integrations with various authentication services are offered as an alternative, including [single sign-on services](/docs/cloud/manage-access/sso-overview). Access to features can be granted/restricted by role using [RBAC](/docs/cloud/manage-access/enterprise-permissions). + +SSO features are essential because they reduce the number of credentials a user must maintain.
Users sign in once and the authentication token is shared among integrated services (such as dbt Cloud). The token expires and must be refreshed at predetermined intervals, requiring the user to go through the authentication process again. If the user is disabled in the SSO provider service, their access to dbt Cloud is disabled, and they cannot override this with local auth credentials. + +[Snowflake](/docs/cloud/manage-access/set-up-snowflake-oauth) and [BigQuery](/docs/cloud/manage-access/set-up-bigquery-oauth) offer OAuth (JSON to pass info and API calls for auth) services as an alternative to SAML (XML to pass info and session cookies for auth). Users can authenticate against the data platform for secure access to dbt Cloud and prevent access when credentials are revoked. + +## Security + +dbt Labs is dedicated to upholding industry standards for Cloud security and GDPR compliance. Our compliance certifications include the following: + +- SOC2 Type II — assesses a service provider’s security control environment against the trust services principles and criteria set forth by the American Institute of Certified Public Accountants (AICPA). +- ISO27001:2013 — a globally recognized standard for establishing and certifying an information security management system (ISMS). +- GDPR - dbt Labs is committed to maintaining GDPR compliance standards. Read more about our [Data Processing Addendum](https://www.getdbt.com/cloud/dpa). + + +For more detailed information about our security practices, read our [Security page](https://www.getdbt.com/security/). 
+ + diff --git a/website/docs/docs/cloud/about-cloud/assets/dbt-cloud-advanced-architecture-guide.pdf b/website/docs/docs/cloud/about-cloud/assets/dbt-cloud-advanced-architecture-guide.pdf new file mode 100644 index 00000000000..33f7449ad50 Binary files /dev/null and b/website/docs/docs/cloud/about-cloud/assets/dbt-cloud-advanced-architecture-guide.pdf differ diff --git a/website/docs/docs/cloud/about-cloud/browsers.md b/website/docs/docs/cloud/about-cloud/browsers.md new file mode 100644 index 00000000000..12665bc7b72 --- /dev/null +++ b/website/docs/docs/cloud/about-cloud/browsers.md @@ -0,0 +1,30 @@ +--- +title: "Supported browsers" +id: "browsers" +description: "dbt Cloud supports the latest browsers like Chrome and Firefox." +pagination_next: null +--- + +To have the best experience with dbt Cloud, we recommend using the latest versions of the following browsers: + +- [Google Chrome](https://www.google.com/chrome/) — Latest version is fully supported in dbt Cloud +- [Mozilla Firefox](https://www.mozilla.org/en-US/firefox/) — Latest version is fully supported in dbt Cloud +- [Apple Safari](https://www.apple.com/safari/) — Latest version support provided on a best-effort basis +- [Microsoft Edge](https://www.microsoft.com/en-us/edge?form=MA13FJ&exp=e00) — Latest version support provided on a best-effort basis + +dbt Cloud provides two types of browser support: + +- Fully supported — dbt Cloud is fully tested and supported on these browsers. Features display and work as intended. +- Best effort — You can access dbt Cloud on these browsers. Features may not display or work as intended. + +You may still be able to access and use dbt Cloud even if you use an older version of a recommended browser or a browser that isn't listed. However, some features might not display as intended. + +:::note +To improve your experience using dbt Cloud, we suggest that you turn off ad blockers.
+::: + +### Browser sessions + +A session is a period of time during which you’re signed in to a dbt Cloud account from a browser. If you close your browser, it will end your session and log you out. You'll need to log in again the next time you try to access dbt Cloud. + +If you've logged in using [SSO](/docs/cloud/manage-access/sso-overview) or [OAuth](/docs/cloud/git/connect-github#personally-authenticate-with-github), you can customize your maximum session duration, which might vary depending on your identity provider (IdP). diff --git a/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md b/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md new file mode 100644 index 00000000000..caeb0203a5e --- /dev/null +++ b/website/docs/docs/cloud/about-cloud/regions-ip-addresses.md @@ -0,0 +1,27 @@ +--- +title: "Regions & IP addresses" +id: "regions-ip-addresses" +description: "Available regions and ip addresses" +--- + +dbt Cloud is [hosted](/docs/cloud/about-cloud/architecture) in multiple regions and will always connect to your data platform or git provider from the below IP addresses. Be sure to allow traffic from these IPs in your firewall, and include them in any database grants. + +[dbt Cloud Enterprise](https://www.getdbt.com/pricing/) plans can choose to have their account hosted in any of the below regions. Organizations **must** choose a single region per dbt Cloud account. If you need to run dbt Cloud in multiple regions, we recommend using multiple dbt Cloud accounts. + + +| Region | Location | Access URL | IP addresses | Developer plan | Team plan | Enterprise plan | +|--------|----------|------------|--------------|----------------|-----------|-----------------| +| North America multi-tenant [^1] | AWS us-east-1 (N. Virginia) | cloud.getdbt.com | 52.45.144.63
54.81.134.249
52.22.161.231 | ✅ | ✅ | ✅ | +| North America Cell 1 [^1] | AWS us-east-1 (N. Virginia) | {account prefix}.us1.dbt.com | [Located in Account Settings](#locating-your-dbt-cloud-ip-addresses) | ❌ | ❌ | ❌ | +| EMEA [^1] | AWS eu-central-1 (Frankfurt) | emea.dbt.com | 3.123.45.39
3.126.140.248
3.72.153.148 | ❌ | ❌ | ✅ | +| APAC [^1] | AWS ap-southeast-2 (Sydney)| au.dbt.com | 52.65.89.235
3.106.40.33
13.239.155.206
| ❌ | ❌ | ✅ | +| Virtual Private dbt or Single tenant | Customized | Customized | Ask [Support](/community/resources/getting-help#dbt-cloud-support) for your IPs | ❌ | ❌ | ✅ | + + +[^1]: These regions support [multi-tenant](/docs/cloud/about-cloud/tenancy) deployment environments hosted by dbt Labs. + +### Locating your dbt Cloud IP addresses + +There are two ways to view your dbt Cloud IP addresses: +- If no projects exist in the account, create a new project, and the IP addresses will be displayed during the **Configure your environment** steps. +- If you have an existing project, navigate to **Account Settings** and ensure you are in the **Projects** pane. Click on a project name, and the **Project Settings** window will open. Locate the **Connection** field and click on the name. Scroll down to the **Settings**, and the first text block lists your IP addresses. diff --git a/website/docs/docs/cloud/about-cloud/tenancy.md b/website/docs/docs/cloud/about-cloud/tenancy.md new file mode 100644 index 00000000000..5785533264e --- /dev/null +++ b/website/docs/docs/cloud/about-cloud/tenancy.md @@ -0,0 +1,27 @@ +--- +title: Tenancy +id: tenancy +description: "Information about single tenant and multi-tenant dbt Cloud instances" +--- + +import AboutCloud from '/snippets/_test-tenancy.md'; + + + +### Multi-tenant + +The Multi Tenant (SaaS) deployment environment refers to the SaaS dbt Cloud application hosted by dbt Labs. This is the most commonly used deployment and is completely managed and maintained by dbt Labs, the makers of dbt. As a SaaS product, a user can quickly [create an account](https://www.getdbt.com/signup/) on our North American servers and get started using the dbt and related services immediately. _If your organization requires cloud services hosted on EMEA or APAC regions_, please [contact us](https://www.getdbt.com/contact/). 
The deployment is hosted on AWS and is always kept up to date with the currently supported dbt versions, software updates, and bug fixes. + +### Single tenant + +The single tenant deployment environment provides a hosted alternative to the multi-tenant (SaaS) dbt Cloud environment. While still managed and maintained by dbt Labs, single tenant dbt Cloud instances provide dedicated infrastructure in a virtual private cloud (VPC) environment. This is accomplished by spinning up all the necessary infrastructure with a re-usable Infrastructure as Code (IaC) deployment built with [Terraform](https://www.terraform.io/). The single tenant infrastructure lives in a dedicated AWS or Azure account and can be customized with certain configurations, such as firewall rules to limit inbound traffic or hosting in a specific region. + +A few common reasons for choosing a single tenant deployment over the Production SaaS product include: +- A requirement that the dbt Cloud application be hosted in a dedicated VPC that is logically separated from other customer infrastructure +- A desire for multiple isolated dbt Cloud instances for testing, development, etc. + +_To learn more about setting up a dbt Cloud single tenant deployment, [please contact our sales team](mailto:sales@getdbt.com)._ + +### Available features + + diff --git a/website/docs/docs/cloud/billing.md b/website/docs/docs/cloud/billing.md new file mode 100644 index 00000000000..1d71d33e9a1 --- /dev/null +++ b/website/docs/docs/cloud/billing.md @@ -0,0 +1,260 @@ +--- +title: "Billing" +id: billing +description: "dbt Cloud billing information." +sidebar_label: Billing +pagination_next: null +pagination_prev: null +--- + +dbt Cloud offers a variety of [plans and pricing](https://www.getdbt.com/pricing/) to fit your organization’s needs.
With flexible billing options that appeal to large enterprises and small businesses and [server availability](/docs/cloud/about-cloud/regions-ip-addresses) worldwide, dbt Cloud is the fastest and easiest way to begin transforming your data. + +## How does dbt Cloud pricing work? + +As a customer, you pay for the number of seats you have and the amount of usage consumed each month. Seats are billed primarily on the number of Developer and Read-Only licenses purchased. Usage is based on the number of [Successful Models Built](#what-counts-as-a-successful-model-built) and, if purchased and used, Semantic Layer Query Units subject to reasonable usage. All billing computations are conducted in Coordinated Universal Time (UTC). + +### What counts as a seat license? + +There are three types of possible seat licenses: + +* **Developer** — for roles and permissions that require interaction with the dbt Cloud environment day-to-day. +* **Read-Only** — for access to view certain documents and reports. +* **IT** — for access to specific features related to account management (for example, configuring git integration). + +### What counts as a Successful Model Built? + +dbt Cloud considers a Successful Model Built to be any model that is successfully built via a run through dbt Cloud’s orchestration functionality in a dbt Cloud deployment environment. Models are counted when built and run. This includes any jobs run via dbt Cloud's scheduler, CI builds (jobs triggered by pull requests), runs kicked off via the dbt Cloud API, and any successor dbt Cloud tools with similar functionality. This also includes models that are successfully built even when a run may fail to complete. For example, you may have a job that contains 100 models and on one of its runs, 51 models are successfully built and then the job fails. In this situation, only 51 models would be counted. + +Any models built in a dbt Cloud development environment (for example, via the IDE) do not count towards your usage.
Tests, seeds, ephemeral models, and snapshots also do not count. + +| What counts towards Successful Models Built | | +|---------------------------------------------|---------------------| +| View | ✅ | +| Table | ✅ | +| Incremental | ✅ | +| Ephemeral Models | ❌ | +| Tests | ❌ | +| Seeds | ❌ | +| Snapshots | ❌ | + +### What counts as a Query Unit? + +The dbt Semantic Layer, powered by MetricFlow, measures usage in distinct query units. Every successful request you make to render or run SQL to the Semantic Layer API counts as at least one query unit, even if no data is returned. If the query calculates or renders SQL for multiple metrics, each calculated metric will be counted as a query unit. +If a request to run a query is not executed successfully in the data platform or if a query results in an error without completion, it is not counted as a query unit. Requests for metadata from the Semantic Layer are also not counted as query units. + +Examples of query units include: + +Querying one metric, grouping by one dimension → 1 query unit + +```shell +dbt sl query --metrics revenue --group_by metric_time +``` +Querying one metric, grouping by two dimensions → 1 query unit + +```shell +dbt sl query --metrics revenue --group_by metric_time,user__country +``` + +Querying two metrics, grouping by two dimensions → 2 query units + +```shell +dbt sl query --metrics revenue,gross_sales --group_by metric_time,user__country +``` + +Running an explain for one metric → 1 query unit + +```shell +dbt sl query --metrics revenue --group_by metric_time --explain +``` + +Running an explain for two metrics → 2 query units + +```shell +dbt sl query --metrics revenue,gross_sales --group_by metric_time --explain +``` + +Running a query for only dimensions such as dimension_values or a query with no metrics → 1 query unit + +```shell +dbt sl list dimension-values --dimension user__country +``` + +### Viewing usage in the product + +Viewing usage in the product is restricted to specific
roles: + +* Team plan — Owner group +* Enterprise plan — Account and billing admin roles + +For an account-level view of usage, if you have access to the **Billing** and **Usage** pages, you can see an estimate of the usage for the month. In the Billing page of the **Account Settings**, you can see how your account tracks against its usage. You can also see which projects are building the most models. + +As a Team and Developer plan user, you can see how the account is tracking against the included models built. As an Enterprise plan user, you can see how much you have drawn down from your annual commit and how much remains. + +On each Project Home page, any user with access to that project can see how many models are built each month. From there, additional details on top jobs by models built can be found on each Environment page. + +In addition, you can look at the Job Details page's Insights tab to show how many models are being built per month for that particular job and which models are taking the longest to build. + +Any usage data is only an estimate of your usage, and there may be a delay in showing usage data in the product — your final usage for the month will be visible on your monthly statements (statements applicable to Team and Enterprise plans). + + +## Plans and Billing + +dbt Cloud offers several [plans](https://www.getdbt.com/pricing) with different features that meet your needs. We may make changes to our plan details from time to time. We'll always let you know in advance, so you can be prepared. The following section explains how billing works in each plan. + +### Developer plan billing + +Developer plans are free and include one Developer license and 3,000 models each month. Models are refreshed at the beginning of each calendar month. If you exceed 3,000 models, any subsequent runs will be canceled until models are refreshed or until you upgrade to a paid plan. The rest of the dbt Cloud platform is still accessible, and no work will be lost. 
+ +All included successful models built numbers above reflect our most current pricing and packaging. Based on your usage terms when you signed up for the Developer Plan, the included model entitlements may be different from what’s reflected above. + + +### Team plan billing + +Team customers pay monthly via credit card for seats and usage, and accounts include 15,000 models monthly. Seats are charged upfront at the beginning of the month. If you add seats during the month, seats will be prorated and charged on the same day. Seats removed during the month will be reflected on the next invoice and are not eligible for refunds. You can change the credit card information and the number of seats from the billings section anytime. Accounts will receive one monthly invoice that includes the upfront charge for the seats and the usage charged in arrears from the previous month. + +Usage is calculated and charged in arrears for the previous month. If you exceed 15,000 models in any month, you will be billed for additional usage on your next invoice. Additional usage is billed at the rates on our [pricing page](https://www.getdbt.com/pricing). + + +Included models that are not consumed do not roll over to future months. You can estimate your bill with a simple formula: + +`($100 x number of developer seats) + ((models built - 15,000) x $0.01)` + +All included successful models built numbers above reflect our most current pricing and packaging. Based on your usage terms when you signed up for the Team Plan, the included model entitlements may be different from what’s reflected above. + +### Enterprise plan billing + +As an Enterprise customer, you pay annually via invoice, monthly in arrears for additional usage (if applicable), and may benefit from negotiated usage rates. Please refer to your order form or contract for your specific pricing details, or [contact the account team](https://www.getdbt.com/contact-demo) with any questions. 
+ +### Legacy plans + +If you purchased the dbt Cloud Team plan before August 11, 2023, you remain on a legacy pricing plan as long as your account is in good standing. The legacy pricing plan is based on seats and includes unlimited models, subject to reasonable use. + +:::note Legacy Semantic Layer + +For customers using the legacy Semantic Layer with the dbt_metrics package, this product will be deprecated in December 2023. Legacy users may choose to upgrade at any time to the revamped version, Semantic Layer powered by MetricFlow. The revamped version is available to most customers (see [prerequisites](/docs/use-dbt-semantic-layer/quickstart-sl#prerequisites)) for a limited time on a free trial basis, subject to reasonable use. + +::: + +dbt Labs may institute use limits if reasonable use is exceeded. Additional features, upgrades, or updates may be subject to separate charges. Any changes to your current plan pricing will be communicated in advance according to our Terms of Use. + + +## Managing usage + +From anywhere in the dbt Cloud account, click the **gear icon** and click **Account settings**. The **Billing** option will be on the left side menu under the **Account Settings** heading. Here, you can view individual available plans and the features provided for each. + +### Usage notifications + +Every plan automatically sends email alerts when 75%, 90%, and 100% of usage estimates have been reached. In the Team plan, all users within the Owner group will receive alerts. In Enterprise plans, all users with the Account Admin and Billing Admin permission sets will receive alerts. Users cannot opt out of these emails. If you would like additional users to receive these alert emails, please provide them with the applicable permissions mentioned above. Note that your usage may already be higher than the percentage indicated in the alert due to your usage pattern and minor latency times. + +### How do I stop usage from accruing?
+ +There are 2 options to disable models from being built and charged: + +1. Open the **Job Settings** of every job and navigate to the **Triggers** section. Disable the **Run on Schedule** and set the **Continuous Integration** feature **Run on Pull Requests?** to **No**. Check your workflows to ensure that you are not triggering any runs via the dbt Cloud API. This option will enable you to keep your dbt Cloud jobs without building more models. +2. Alternatively, you can delete some or all of your dbt Cloud jobs. This will ensure that no runs are kicked off, but you will permanently lose your job(s). + + +## Optimize costs in dbt Cloud + +dbt Cloud offers ways to optimize your models built usage and warehouse costs. + +### Best practices for optimizing successful models built + +When thinking of ways to optimize your costs from successful models built, there are methods to reduce those costs while still adhering to best practices. To ensure that you are still utilizing tests and rebuilding views when logic is changed, it's recommended to implement a combination of the best practices that fit your needs. More specifically, if you decide to exclude views from your regularly scheduled dbt Cloud job runs, it's imperative that you set up a [merge job](#build-only-changed-views) to deploy updated view logic when changes are detected. + +#### Exclude views in a dbt Cloud job + +Many dbt Cloud users utilize views, which don’t always need to be rebuilt every time you run a job. For any jobs that contain views that _do not_ include macros that dynamically generate code (for example, case statements) based on upstream tables and also _do not_ have tests, you can implement these steps: + +1. Go to your current production deployment job in dbt Cloud. +2. Modify your command to include: `--exclude config.materialized:view`. +3. Save your job changes. 
+ +If you have views that contain macros with case statements based on upstream tables, these will need to be run each time to account for new values. If you still need to test your views with each run, follow the [Exclude views while still running tests](#exclude-views-while-running-tests) best practice to create a custom selector. + +#### Exclude views while running tests + +Running tests for views in every job run can help keep data quality intact and save you from the need to rerun failed jobs. To exclude views from your job run while running tests, you can follow these steps to create a custom [selector](https://docs.getdbt.com/reference/node-selection/yaml-selectors) for your job command. + +1. Open your dbt project in the dbt Cloud IDE. +2. Add a file called `selectors.yml` in your top-level project folder. +3. In the file, add the following code: + + ```yaml + selectors: + - name: skip_views_but_test_views + description: > + A default selector that will exclude materializing views + without skipping tests on views. + default: true + definition: + union: + - union: + - method: path + value: "*" + - exclude: + - method: config.materialized + value: view + - method: resource_type + value: test + + ``` + +4. Save the file and commit it to your project. +5. Modify your dbt Cloud jobs to include `--selector skip_views_but_test_views`. + +#### Build only changed views + +If you want to ensure that you're building views whenever the logic is changed, create a merge job that gets triggered when code is merged into main: + +1. Ensure you have a [CI job setup](/docs/deploy/ci-jobs) in your environment. +2. Create a new [deploy job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) and call it “Merge Job". +3. Set the  **Environment** to your CI environment. Refer to [Types of environments](/docs/deploy/deploy-environments#types-of-environments) for more details. +4. Set **Commands** to: `dbt run -s state:modified+`. 
+ Executing `dbt build` in this context is unnecessary because the CI job was used to both run and test the code that just got merged into main. +5. Under the **Execution Settings**, select the default production job to compare changes against: + - **Defer to a previous run state** — Select the “Merge Job” you created so the job compares and identifies what has changed since the last merge. +6. In your dbt project, follow the steps in [Run a dbt Cloud job on merge](/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge) to create a script to trigger the dbt Cloud API to run your job after a merge happens within your git repository or watch this [video](https://www.loom.com/share/e7035c61dbed47d2b9b36b5effd5ee78?sid=bcf4dd2e-b249-4e5d-b173-8ca204d9becb). + +The purpose of the merge job is to: + +- Immediately deploy any changes from PRs to production. +- Ensure your production views remain up-to-date with how they’re defined in your codebase while remaining cost-efficient when running jobs in production. + +The merge action will optimize your cloud data platform spend and shorten job times, but you’ll need to decide if making the change is right for your dbt project. + +### Rework inefficient models + +#### Job Insights tab + +To reduce your warehouse spend, you can identify what models, on average, are taking the longest to build in the **Job** page under the **Insights** tab. This chart looks at the average run time for each model based on its last 20 runs. Any models that are taking longer than anticipated to build might be prime candidates for optimization, which will ultimately reduce cloud warehouse spending. + +#### Model Timing tab + +To understand better how long each model takes to run within the context of a specific run, you can look at the **Model Timing** tab. Select the run of interest on the **Run History** page to find the tab. On that **Run** page, click **Model Timing**. 
+ +Once you've identified which models could be optimized, check out these other resources that walk through how to optimize your work: +* [Build scalable and trustworthy data pipelines with dbt and BigQuery](https://services.google.com/fh/files/misc/dbt_bigquery_whitepaper.pdf) +* [Best Practices for Optimizing Your dbt and Snowflake Deployment](https://www.snowflake.com/wp-content/uploads/2021/10/Best-Practices-for-Optimizing-Your-dbt-and-Snowflake-Deployment.pdf) +* [How to optimize and troubleshoot dbt models on Databricks](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks) + +## FAQs + +* What happens if I need more than 8 seats on the Team plan? +_If you need more than 8 developer seats, select the Contact Sales option from the billing settings to talk to our sales team about an Enterprise plan._ + +* What if I go significantly over my included free models on the Team or Developer plan? +_Consider upgrading to a Team or Enterprise plan. Team plans include more models and allow you to exceed the monthly usage limit. Enterprise accounts are supported by a dedicated account management team and offer annual plans, custom configurations, and negotiated usage rates._ + +* I want to upgrade my plan. Will all of my work carry over? +_Yes. Your dbt Cloud account will be upgraded without impacting your existing projects and account settings._ + +* How do I determine the right plan for me? + _The best option is to consult with our sales team. They'll help you figure out what is right for your needs. We also offer a free two-week trial on the Team plan._ + +* What are the Semantic Layer trial terms? +_Team and Enterprise customers can sign up for a free trial of the dbt Semantic Layer, powered by MetricFlow, for use of up to 1,000 query units per month. The trial will be available at least through January 2024. dbt Labs may extend the trial period in its sole discretion. 
During the trial period, we may reach out to discuss pricing options or ask for feedback. At the end of the trial, free access may be removed and a purchase may be required to continue use. dbt Labs reserves the right to change limits in a free trial or institute pricing when required or at any time in its sole discretion._ + +* What is the reasonable use limitation for the dbt Semantic Layer powered by MetricFlow during the trial? +_Each account will be limited to 1,000 Queried Metrics per month during the trial period and may be changed at the sole discretion of dbt Labs._ + diff --git a/website/docs/docs/cloud/cloud-cli-installation.md b/website/docs/docs/cloud/cloud-cli-installation.md new file mode 100644 index 00000000000..6d3060d9c23 --- /dev/null +++ b/website/docs/docs/cloud/cloud-cli-installation.md @@ -0,0 +1,232 @@ +--- +title: Install dbt Cloud CLI +sidebar_label: "Install dbt Cloud CLI" +id: cloud-cli-installation +description: "Instructions for installing and configuring dbt Cloud CLI" +pagination_next: "docs/cloud/configure-cloud-cli" +--- + +import CloudCLIFlag from '/snippets/_cloud-cli-flag.md'; + + + + +dbt Cloud natively supports developing using a command line (CLI), empowering team members to contribute with enhanced flexibility and collaboration. The dbt Cloud CLI allows you to run dbt commands against your dbt Cloud development environment from your local command line. + +dbt commands are run against dbt Cloud's infrastructure and benefit from: + +* Secure credential storage in the dbt Cloud platform. +* [Automatic deferral](/docs/cloud/about-cloud-develop-defer) of build artifacts to your Cloud project's production environment. +* Speedier, lower-cost builds. +* Support for dbt Mesh ([cross-project `ref`](/docs/collaborate/govern/project-dependencies)), +* Significant platform improvements, to be released over the coming months. 
+ + +## Prerequisites +The dbt Cloud CLI is available in all [deployment regions](/docs/cloud/about-cloud/regions-ip-addresses) and for both multi-tenant and single-tenant accounts (Azure single-tenant not supported at this time). + +You must be on dbt version 1.5 or higher. Refer to [dbt Cloud versions](/docs/dbt-versions/upgrade-core-in-cloud) to upgrade. + +## Install dbt Cloud CLI + +You can install the dbt Cloud CLI on the command line by using one of these methods: + + + + + +:::info Use native packages or a virtual environment to avoid overriding dbt Core + +Installing the dbt Cloud CLI with pip replaces dbt Core. This change can be avoided by using the native install method and configuring your PATH or by creating a new virtual environment. + +Otherwise, to switch back to dbt Core, uninstall the dbt Cloud CLI and follow the dbt Core installation instructions. + +::: + +Before installing the dbt Cloud CLI, make sure you have Python installed and a virtual environment set up with venv or pyenv. If you already have a Python environment configured, you can skip to the [pip installation step](#install-dbt-cloud-cli-in-pip). + +### Install a virtual environment + +We recommend using virtual environments (venv) to namespace `cloud-cli`. + +1. Create a new venv: + ```shell + python3 -m venv dbt-cloud + ``` + +2. Activate the virtual environment each time you create a shell window or session: + ```shell + source dbt-cloud/bin/activate # activate the environment for Mac and Linux OR + dbt-cloud\Scripts\activate # activate the environment for Windows + ``` + +3. (Mac and Linux only) Create an alias to activate your dbt environment with every new shell window or session. You can add the following to your shell's configuration file (for example, $HOME/.bashrc, $HOME/.zshrc) while replacing `PATH_TO_VIRTUAL_ENV_CONFIG` with the path to your virtual environment configuration: + ```shell + alias env_dbt='source PATH_TO_VIRTUAL_ENV_CONFIG/bin/activate' + ``` + +### Install dbt Cloud CLI in pip + +1. 
(Optional) If you already have dbt Core installed, this installation will override that package. Note your dbt Core version in case you need to reinstall it later: + + ```bash + dbt --version + ``` + +2. Make sure you're in your virtual environment and run the following command to install the dbt Cloud CLI: + + ```bash + pip3 install dbt + ``` + +3. (Optional) To revert back to dbt Core, first uninstall both the dbt Cloud CLI and dbt Core. +4. Reinstall dbt Core using the version from Step 1. + + ```bash + pip3 uninstall dbt-core dbt + pip3 install dbt-core==VERSION + ``` + + + + + +Before you begin, make sure you have [Homebrew installed](http://brew.sh/) in your code editor or command line terminal. Refer to the [FAQs](#faqs) if your operating system runs into path conflicts. + + +1. Run the following command to verify that there is no conflict with a dbt Core installation on your system: + + ```bash + which dbt + ``` + - This should return a `dbt not found`. If the dbt help text appears, use `pip uninstall dbt` to deactivate dbt Core from your machine. + +2. Install the dbt Cloud CLI with Homebrew: + + ```bash + brew untap dbt-labs/dbt + brew tap dbt-labs/dbt-cli + brew install dbt + ``` + +3. Verify the installation by running `dbt --help` from the command line. If the help text doesn't indicate that you're using the dbt Cloud CLI, make sure you've deactivated your pyenv or venv and don't have a version of dbt globally installed. + * You don't have to run the `dbt deps` command when your environment starts. Previously, you had to do it during initialization. However, you'll still need to run `dbt deps` if you make changes to your `packages.yml` file. + + + + + +Refer to the [FAQs](#faqs) if your operating system runs into path conflicts. + +1. Download the latest Windows release for your platform from [GitHub](https://github.com/dbt-labs/dbt-cli/releases). + +2. Extract the `dbt.exe` executable into the same folder as your dbt project. 
+ +:::info + +Advanced users can configure multiple projects to use the same dbt Cloud CLI by placing the executable in the Program Files folder and [adding it to their Windows PATH environment variable](https://medium.com/@kevinmarkvi/how-to-add-executables-to-your-path-in-windows-5ffa4ce61a53). + +Note that if you are using VS Code, you must restart it to pick up modified environment variables. +::: + +3. Verify the installation by running `./dbt --help` from the command line. If the help text doesn't indicate that you're using the dbt Cloud CLI, make sure you've deactivated your pyenv or venv and don't have a version of dbt globally installed. + * You don't have to run the `dbt deps` command when your environment starts. Previously, you had to do it during initialization. However, you'll still need to run `dbt deps` if you make changes to your `packages.yml` file. + + + + + +Refer to the [FAQs](#faqs) if your operating system runs into path conflicts. + +1. Download the latest Linux release for your platform from [GitHub](https://github.com/dbt-labs/dbt-cli/releases). (Pick the file based on your CPU architecture) + +2. Extract the `dbt-cloud-cli` binary to the same folder as your dbt project. + + ```bash + tar -xf dbt_0.29.9_linux_amd64.tar.gz + ./dbt --version + ``` + +:::info + +Advanced users can configure multiple projects to use the same Cloud CLI executable by adding it to their PATH environment variable in their shell profile. + +::: + +3. Verify the installation by running `./dbt --help` from the command line. If the help text doesn't indicate that you're using the dbt Cloud CLI, make sure you've deactivated your pyenv or venv and don't have a version of dbt globally installed. + * You don't have to run the `dbt deps` command when your environment starts. Previously, you had to do it during initialization. However, you'll still need to run `dbt deps` if you make changes to your `packages.yml` file. 
+ + + + + + +## Update dbt Cloud CLI + +The following instructions explain how to update the dbt Cloud CLI to the latest version depending on your operating system. + +During the public preview period, we recommend updating before filing a bug report. This is because the API is subject to breaking changes. + + + + + +To update: +- Make sure you're in your virtual environment +- Run `pip install --upgrade dbt`. + + + + + +To update the dbt Cloud CLI, run `brew upgrade dbt`. (You can also use `brew install dbt`). + + + + + +To update, follow the same process explained in [Windows](/docs/cloud/cloud-cli-installation?install=windows#install-dbt-cloud-cli) and replace the existing `dbt.exe` executable with the new one. + + + + + +To update, follow the same process explained in [Linux](/docs/cloud/cloud-cli-installation?install=linux#install-dbt-cloud-cli) and replace the existing `dbt` executable with the new one. + + + + + + +## Next steps + +After installation, you can [configure](/docs/cloud/configure-cloud-cli) the dbt Cloud CLI for your dbt Cloud project and use it to run [dbt commands](/reference/dbt-commands) similar to dbt Core. + +For example, you can execute `dbt compile` to compile a project using dbt Cloud. + +Note that if you're using the dbt Cloud CLI, you can connect to your data platform directly in the dbt Cloud interface and don't need a [`profiles.yml`](/docs/core/connect-data-platform/profiles.yml) file. + + +## FAQs + +
+ +What's the difference between the dbt Cloud CLI and dbt Core? +The dbt Cloud CLI and dbt Core, an open-source project, are both command line tools that enable you to run dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its features. + +
+ +
+How do I run both the dbt Cloud CLI and dbt Core? +For compatibility, both the dbt Cloud CLI and dbt Core are invoked by running dbt. This can create path conflicts if your operating system selects one over the other based on your $PATH environment variable (settings). + +If you have dbt Core installed locally, either: + +1. Install using [pip](/docs/cloud/cloud-cli-installation?install=pip#install-dbt-cloud-cli). + +2. Install natively, but ensure that you deactivate your Python environment or uninstall it using `pip uninstall dbt` before proceeding. + +3. (Advanced users) Install natively, but modify the $PATH environment variable to correctly point to the dbt Cloud CLI binary to use both dbt Cloud CLI and dbt Core together. + +You can always uninstall the dbt Cloud CLI to return to using dbt Core. +
diff --git a/website/docs/docs/cloud/configure-cloud-cli.md b/website/docs/docs/cloud/configure-cloud-cli.md new file mode 100644 index 00000000000..f13a4d267c3 --- /dev/null +++ b/website/docs/docs/cloud/configure-cloud-cli.md @@ -0,0 +1,92 @@ +--- +title: Configure dbt Cloud CLI +id: configure-cloud-cli +description: "Instructions on how to configure the dbt Cloud CLI" +sidebar_label: "Configure dbt Cloud CLI" +pagination_next: null +--- + +import CloudCLIFlag from '/snippets/_cloud-cli-flag.md'; + + + + +## Prerequisites + +- You must set up a project in dbt Cloud. + - **Note** — If you're using the dbt Cloud CLI, you can connect to your data platform directly in the dbt Cloud interface and don't need a [`profiles.yml`](/docs/core/connect-data-platform/profiles.yml) file. +- You must have your [personal development credentials](/docs/dbt-cloud-environments#set-developer-credentials) set for that project. The dbt Cloud CLI will use these credentials, stored securely in dbt Cloud, to communicate with your data platform. +- You must [enroll](/docs/dbt-versions/experimental-features) in the dbt Cloud beta features. + - To enroll, navigate to your **Profile Settings** and enable the **Beta** flag under **Experimental Features**. +- You must be on dbt version 1.5 or higher. Refer to [dbt Cloud versions](/docs/dbt-versions/upgrade-core-in-cloud) to upgrade. + +## Configure the dbt Cloud CLI + +Once you install the dbt Cloud CLI, you need to configure it to connect to a dbt Cloud project. + +1. Ensure you meet the prerequisites above. + +2. Download your credentials from dbt Cloud by clicking on the **Try the dbt Cloud CLI** banner on the dbt Cloud homepage. 
Alternatively, if you're in dbt Cloud, you can download the credentials from the links provided based on your region: + + - North America: https://cloud.getdbt.com/cloud-cli + - EMEA: https://emea.dbt.com/cloud-cli + - APAC: https://apac.dbt.com/cloud-cli + - North American Cell 1: `https://ACCOUNT_PREFIX.us1.dbt.com/cloud-cli` + - Single-tenant: `https://YOUR_ACCESS_URL/cloud-cli` + +3. Follow the banner instructions and download the config file to: + - Mac or Linux: `~/.dbt/dbt_cloud.yml` + - Windows: `C:\Users\yourusername\.dbt\dbt_cloud.yml` + + The config file looks like this: + + ```yaml + version: "1" + context: + active-project: "" + active-host: "" + defer-env-id: "" + projects: + - project-id: "" + account-host: "" + api-key: "" + + - project-id: "" + account-host: "" + api-key: "" + + ``` + +4. After downloading the config file, navigate to a dbt project in your terminal: + + ```bash + cd ~/dbt-projects/jaffle_shop + ``` + +5. In your `dbt_project.yml` file, ensure you have or include a `dbt-cloud` section with a `project-id` field. The `project-id` field contains the dbt Cloud project ID you want to use. + + ```yaml + # dbt_project.yml + name: + + version: + ... + + dbt-cloud: + project-id: PROJECT_ID + ``` + + - To find your project ID, select **Develop** in the dbt Cloud navigation menu. You can use the URL to find the project ID. For example, in `https://cloud.getdbt.com/develop/26228/projects/123456`, the project ID is `123456`. + +## Use the dbt Cloud CLI + +- The dbt Cloud CLI shares the same set of [dbt commands](/reference/dbt-commands) as dbt Core and processes the commands you invoke. +- It allows you to use automatic deferral of build artifacts to your Cloud project's production environment. +- It also supports [project dependencies](/docs/collaborate/govern/project-dependencies), which allows you to depend on another project using the metadata service in dbt Cloud. 
+ - Project dependencies instantly connects to and references (or `ref`) public models defined in other projects. This means you don't need to execute or analyze these upstream models yourself. Instead, you treat them as an API that returns a dataset. + +:::tip Use the --help flag +As a tip, most command-line tools have a `--help` flag to show available commands and arguments. Use the `--help` flag with dbt in two ways: +- `dbt --help`: Lists the commands available for dbt
+- `dbt run --help`: Lists the flags available for the `run` command +::: diff --git a/website/docs/docs/cloud/connect-data-platform/about-connections.md b/website/docs/docs/cloud/connect-data-platform/about-connections.md new file mode 100644 index 00000000000..1fe89c7273c --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/about-connections.md @@ -0,0 +1,34 @@ +--- +title: "About data platform connections" +id: about-connections +description: "Information about data platform connections" +sidebar_label: "About data platform connections" +pagination_next: "docs/cloud/connect-data-platform/connect-starburst-trino" +pagination_prev: null +--- +dbt Cloud can connect with a variety of data platform providers including: +- [Amazon Redshift](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) +- [Apache Spark](/docs/cloud/connect-data-platform/connect-apache-spark) +- [Databricks](/docs/cloud/connect-data-platform/connect-databricks) +- [Google BigQuery](/docs/cloud/connect-data-platform/connect-bigquery) +- [PostgreSQL](/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb) +- [Snowflake](/docs/cloud/connect-data-platform/connect-snowflake) +- [Starburst or Trino](/docs/cloud/connect-data-platform/connect-starburst-trino) + +import MSCallout from '/snippets/_microsoft-adapters-soon.md'; + + + +You can connect to your database in dbt Cloud by clicking the gear in the top right and selecting **Account Settings**. From the Account Settings page, click **+ New Project**. + + + +These connection instructions provide the basic fields required for configuring a data platform connection in dbt Cloud. For more detailed guides, which include demo project data, read our [Quickstart guides](https://docs.getdbt.com/quickstarts) + +## IP Restrictions + +dbt Cloud will always connect to your data platform from the IP addresses specified in the [Regions & IP addresses](/docs/cloud/about-cloud/regions-ip-addresses) page. 
+ +Be sure to allow traffic from these IPs in your firewall, and include them in any database grants. + +Allowing these IP addresses only enables the connection to your data warehouse. However, you might want to send API requests from your restricted network to the dbt Cloud API. For example, you could use the API to send a POST request that [triggers a job to run](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun). Using the dbt Cloud API requires that you allow the `cloud.getdbt.com` subdomain. For more on the dbt Cloud architecture, see [Deployment architecture](/docs/cloud/about-cloud/architecture). diff --git a/website/docs/docs/cloud/connect-data-platform/connect-apache-spark.md b/website/docs/docs/cloud/connect-data-platform/connect-apache-spark.md new file mode 100644 index 00000000000..0186d821a54 --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connect-apache-spark.md @@ -0,0 +1,39 @@ +--- +title: "Connect Apache Spark" +id: connect-apache-spark +description: "Setup instructions for connecting Apache Spark to dbt Cloud" +sidebar_label: "Connect Apache Spark" +pagination_next: null +--- + + + +:::note +See [Connect Databricks](/docs/cloud/connect-data-platform/connect-databricks) for the Databricks version of this page. +::: + +dbt Cloud supports connecting to an Apache Spark cluster using the HTTP method +or the Thrift method. Note: While the HTTP method can be used to connect to +an all-purpose Databricks cluster, the ODBC method is recommended for all +Databricks connections. For further details on configuring these connection +parameters, please see the [dbt-spark documentation](https://github.com/dbt-labs/dbt-spark#configuring-your-profile). + +To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Apache Spark-specific configuration](/reference/resource-configs/spark-configs). 
+ + +The following fields are available when creating an Apache Spark connection using the +HTTP and Thrift connection methods: + +| Field | Description | Examples | +| ----- | ----------- | -------- | +| Host Name | The hostname of the Spark cluster to connect to | `yourorg.sparkhost.com` | +| Port | The port to connect to Spark on | 443 | +| Organization | Optional (default: 0) | 0123456789 | +| Cluster | The ID of the cluster to connect to | 1234-567890-abc12345 | +| Connection Timeout | Number of seconds after which to timeout a connection | 10 | +| Connection Retries | Number of times to attempt connecting to cluster before failing | 10 | +| User | Optional | dbt_cloud_user | +| Auth | Optional, supply if using Kerberos | `KERBEROS` | +| Kerberos Service Name | Optional, supply if using Kerberos | `hive` | + + diff --git a/website/docs/docs/cloud/connect-data-platform/connect-databricks.md b/website/docs/docs/cloud/connect-data-platform/connect-databricks.md new file mode 100644 index 00000000000..b66f5890c61 --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connect-databricks.md @@ -0,0 +1,38 @@ +--- +title: "Connect Databricks" +id: connect-databricks +description: "Setup instructions for connecting Databricks to dbt Cloud" +sidebar_label: "Connect Databricks" +--- + +The dbt-databricks adapter is maintained by the Databricks team and is verified by dbt Labs. The Databricks team is committed to supporting and improving the adapter over time, so you can be sure the integrated experience will provide the best of dbt and the best of Databricks. Connecting to Databricks via dbt-spark has been deprecated. + +## About the dbt-databricks adapter + +dbt-databricks is compatible with the following versions of dbt Core in dbt Cloud with varying degrees of functionality. 
+ +| Feature | dbt Versions | +| ----- | ----------- | +| dbt-databricks | Available starting with dbt 1.0 in dbt Cloud| +| Unity Catalog | Available starting with dbt 1.1 | +| Python models | Available starting with dbt 1.3 | + +The dbt-databricks adapter offers: +- **Easier set up** +- **Better defaults:** +The dbt-databricks adapter is more opinionated, guiding users to an improved experience with less effort. Design choices of this adapter include defaulting to Delta format, using merge for incremental models, and running expensive queries with Photon. +- **Support for Unity Catalog:** +Unity Catalog allows Databricks users to centrally manage all data assets, simplifying access management and improving search and query performance. Databricks users can now get three-part data hierarchies – catalog, schema, model name – which solves a longstanding friction point in data organization and governance. + +To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Databricks-specific configuration](/reference/resource-configs/databricks-configs). 
+ + +To set up the Databricks connection, supply the following fields: + +| Field | Description | Examples | +| ----- | ----------- | -------- | +| Server Hostname | The hostname of the Databricks account to connect to | dbc-a2c61234-1234.cloud.databricks.com | +| HTTP Path | The HTTP path of the Databricks cluster or SQL warehouse | /sql/1.0/warehouses/1a23b4596cd7e8fg | +| Catalog | Name of Databricks Catalog (optional) | Production | + + diff --git a/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md b/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md new file mode 100644 index 00000000000..dae0ee1d178 --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connect-redshift-postgresql-alloydb.md @@ -0,0 +1,66 @@ +--- +title: "Connect Redshift, PostgreSQL, and AlloyDB" +id: connect-redshift-postgresql-alloydb +description: "Setup instructions for connecting Redshift, PostgreSQL, and AlloyDB to dbt Cloud" +sidebar_label: "Connect Redshift, PostgreSQL, and AlloyDB" +--- + +The following fields are required when creating a Postgres, Redshift, or AlloyDB connection: + +| Field | Description | Examples | +| ----- | ----------- | -------- | +| Host Name | The hostname of the Postgres, Redshift, or AlloyDB database to connect to. This can either be a hostname or an IP address. | `xxx.us-east-1.amazonaws.com` | +| Port | Usually 5432 (Postgres) or 5439 (Redshift) | `5439` | +| Database | The logical database to connect to and run queries against. | `analytics` | + +**Note**: When you set up a Redshift or Postgres connection in dbt Cloud, SSL-related parameters aren't available as inputs. + + + +### Connecting via an SSH Tunnel + +To connect to a Postgres, Redshift, or AlloyDB instance via an SSH tunnel, select the **Use SSH Tunnel** option when creating your connection. 
When configuring the tunnel, you must supply the hostname, username, and port for the [bastion server](#about-the-bastion-server-in-aws). + +Once the connection is saved, a public key will be generated and displayed for the Connection. You can copy this public key to the bastion server to authorize dbt Cloud to connect to your database via the bastion server. + + + +#### About the Bastion server in AWS + +
+ What is a Bastion server? +
+
A bastion server in Amazon Web Services (AWS) is a host that allows dbt Cloud to open an SSH connection.

+ +dbt Cloud only sends queries and doesn't transmit large data volumes. This means the bastion server can run on an AWS instance of any size, like a t2.small instance or t2.micro.



+ +Make sure the location of the instance is the same Virtual Private Cloud (VPC) as the Redshift instance, and configure the security group for the bastion server to ensure that it's able to connect to the warehouse port. +
+
+
+ + +### Configuring the Bastion Server in AWS + +To configure the SSH tunnel in dbt Cloud, you'll need to provide the hostname/IP of your bastion server, username, and port, of your choosing, that dbt Cloud will connect to. Review the following steps: + +- Verify the bastion server has its network security rules set up to accept connections from the [dbt Cloud IP addresses](/docs/cloud/about-cloud/regions-ip-addresses) on whatever port you configured. +- Set up the user account by using the bastion server instance's CLI. The following example uses the username `dbtcloud`: + +```shell +sudo groupadd dbtcloud +sudo useradd -m -g dbtcloud dbtcloud +sudo su - dbtcloud +mkdir ~/.ssh +chmod 700 ~/.ssh +touch ~/.ssh/authorized_keys +chmod 600 ~/.ssh/authorized_keys +``` + +- Copy and paste the dbt Cloud generated public key into the authorized_keys file. + +The Bastion server should now be ready for dbt Cloud to use as a tunnel into the Redshift environment. + +## Configuration + +To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Redshift-specific configuration](/reference/resource-configs/redshift-configs). diff --git a/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md b/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md new file mode 100644 index 00000000000..62a58f6e1c5 --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connect-snowflake.md @@ -0,0 +1,106 @@ +--- +title: "Connect Snowflake" +id: connect-snowflake +description: "Configure Snowflake connection." +sidebar_label: "Connect Snowflake" +--- + +The following fields are required when creating a Snowflake connection: + +| Field | Description | Examples | +| ----- | ----------- | -------- | +| Account | The Snowflake account to connect to. 
Take a look [here](/docs/core/connect-data-platform/snowflake-setup#account) to determine what the account field should look like based on your region.| | +| Role | A mandatory field indicating what role should be assumed after connecting to Snowflake | `transformer` | +| Database | The logical database to connect to and run queries against. | `analytics` | +| Warehouse | The virtual warehouse to use for running queries. | `transforming` | + + +**Note:** A crucial part of working with dbt atop Snowflake is ensuring that users (in development environments) and/or service accounts (in deployment to production environments) have the correct permissions to take actions on Snowflake! Here is documentation of some [example permissions to configure Snowflake access](/reference/snowflake-permissions). + +### Username / Password + +**Available in:** Development environments, Deployment environments + +The `Username / Password` auth method is the simplest way to authenticate +Development or Deployment credentials in a dbt project. Simply enter your Snowflake +username (specifically, the `login_name`) and the corresponding user's Snowflake `password` +to authenticate dbt Cloud to run queries against Snowflake on behalf of a Snowflake user. + +**Note**: The schema field in the **Developer Credentials** section is a required field. + + +### Key Pair + +**Available in:** Development environments, Deployment environments + +The `Keypair` auth method uses Snowflake's [Key Pair Authentication](https://docs.snowflake.com/en/user-guide/python-connector-example.html#using-key-pair-authentication) to authenticate Development or Deployment credentials for a dbt Cloud project. + +1. After [generating an encrypted key pair](https://docs.snowflake.com/en/user-guide/key-pair-auth.html#configuring-key-pair-authentication), be sure to set the `rsa_public_key` for the Snowflake user to authenticate in dbt Cloud: + +```sql +alter user jsmith set rsa_public_key='MIIBIjANBgkqh...'; +``` + +2. 
Finally, set the **Private Key** and **Private Key Passphrase** fields in the **Credentials** page to finish configuring dbt Cloud to authenticate with Snowflake using a key pair. + + **Note:** At this time ONLY Encrypted Private Keys are supported by dbt Cloud, and the keys must be of size 4096 or smaller. + +3. To successfully fill in the Private Key field, you **must** include commented lines when you add the passphrase. Leaving the **Private Key Passphrase** field empty will return an error. If you're receiving a `Could not deserialize key data` or `JWT token` error, refer to [Troubleshooting](#troubleshooting) for more info. + +**Example:** + +```sql +-----BEGIN ENCRYPTED PRIVATE KEY----- +< encrypted private key contents here - line 1 > +< encrypted private key contents here - line 2 > +< ... > +-----END ENCRYPTED PRIVATE KEY----- +``` + + + +### Snowflake OAuth + +**Available in:** Development environments, Enterprise plans only + +The OAuth auth method permits dbt Cloud to run development queries on behalf of +a Snowflake user without the configuration of Snowflake password in dbt Cloud. For +more information on configuring a Snowflake OAuth connection in dbt Cloud, please see [the docs on setting up Snowflake OAuth](/docs/cloud/manage-access/set-up-snowflake-oauth). + + +## Configuration + +To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Snowflake-specific configuration](/reference/resource-configs/snowflake-configs). + +## Troubleshooting + + +If you're receiving a `Could not deserialize key data` or `JWT token` error, refer to the following causes and solutions: + +
+ +Error: Could not deserialize key data + + - Possible cause + + - This could be because of mistakes like not copying correctly, missing dashes, or leaving out commented lines. + - Solution + + - You can copy the key from its source and paste it into a text editor to verify it before using it in dbt Cloud. + +
+ +
+Error: JWT token + + - Possible causes + + - This could be a transient issue between Snowflake and dbt Cloud. When connecting to Snowflake, dbt gets a JWT token valid for only 60 seconds. If there's no response from Snowflake within this time, you might see a `JWT token is invalid` error in dbt Cloud. + - The public key was not entered correctly in Snowflake. + + - Solutions + + - dbt needs to retry connections to Snowflake. + - Confirm and enter Snowflake's public key correctly. Additionally, you can reach out to Snowflake for help or refer to this Snowflake doc for more info: [Key-Based Authentication Failed with JWT token is invalid Error](https://community.snowflake.com/s/article/Key-Based-Authentication-Failed-with-JWT-token-is-invalid-Error). + +
diff --git a/website/docs/docs/cloud/connect-data-platform/connect-starburst-trino.md b/website/docs/docs/cloud/connect-data-platform/connect-starburst-trino.md new file mode 100644 index 00000000000..db0d3f61728 --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connect-starburst-trino.md @@ -0,0 +1,28 @@ +--- +title: "Connect Starburst or Trino" +description: "Configure Starburst or Trino connection." +sidebar_label: "Connect Starburst or Trino" +--- + +The following are the required fields for setting up a connection with a [Starburst Enterprise](https://docs.starburst.io/starburst-enterprise/index.html), [Starburst Galaxy](https://docs.starburst.io/starburst-galaxy/index.html), or [Trino](https://trino.io/) cluster. Unless specified, "cluster" means any of these products' clusters. + +| Field | Description | Examples | +| --- | --- | --- | +| **Host** | The hostname of your cluster. Don't include the HTTP protocol prefix. | `mycluster.mydomain.com` | +| **Port** | The port to connect to your cluster. By default, it's 443 for TLS enabled clusters. | `443` | +| **User** | The username (of the account) to log in to your cluster. When connecting to Starburst Galaxy clusters, you must include the role of the user as a suffix to the username.

| Format for Starburst Enterprise or Trino depends on your configured authentication method.
Format for Starburst Galaxy:
  • `user.name@mydomain.com/role`
| +| **Password** | The user's password. | | +| **Database** | The name of a catalog in your cluster. | `example_catalog` | +| **Schema** | The name of a schema that exists within the specified catalog.  | `example_schema` | + +## Roles in Starburst Enterprise + + + +## Catalogs and schemas + + + +## Configuration + +To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [Starburst/Trino-specific configuration](/reference/resource-configs/trino-configs). diff --git a/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md b/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md new file mode 100644 index 00000000000..7ea6e380000 --- /dev/null +++ b/website/docs/docs/cloud/connect-data-platform/connnect-bigquery.md @@ -0,0 +1,50 @@ +--- +title: "Connect BigQuery" +id: connect-bigquery +description: "Configure BigQuery connection." +sidebar_label: "Connect BigQuery" +--- +### JSON keyfile + +:::info Uploading a service account JSON keyfile + +While the fields in a BigQuery connection can be specified manually, we recommend uploading a service account keyfile to quickly and accurately configure a connection to BigQuery. + +::: + +Uploading a JSON keyfile should populate the following fields: +- Project id +- Private key id +- Private key +- Client email +- Client id +- Auth uri +- Token uri +- Auth provider x509 cert url +- Client x509 cert url + +In addition to these fields, there are two other optional fields that can be configured in a BigQuery connection: + +| Field | Description | Examples | +| ----- | ----------- | ------- | +| Timeout | Deprecated; exists for backwards compatibility with older versions of dbt and will be removed in the future. | `300` | +| Location | The [location](https://cloud.google.com/bigquery/docs/locations) where dbt should create datasets. 
| `US`, `EU` | + + + + + +### BigQuery OAuth +**Available in:** Development environments, Enterprise plans only + +The OAuth auth method permits dbt Cloud to run development queries on behalf of +a BigQuery user without the configuration of BigQuery service account keyfile in dbt Cloud. For +more information on the initial configuration of a BigQuery OAuth connection in dbt Cloud, please see +[the docs on setting up BigQuery OAuth](/docs/cloud/manage-access/set-up-bigquery-oauth). + +As an end user, if your organization has set up BigQuery OAuth, you can link a project with your personal BigQuery account in your personal Profile in dbt Cloud, like so: + + +## Configuration + +To learn how to optimize performance with data platform-specific configurations in dbt Cloud, refer to [BigQuery-specific configuration](/reference/resource-configs/bigquery-configs). diff --git a/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-ide.md b/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-ide.md new file mode 100644 index 00000000000..3c41432bc62 --- /dev/null +++ b/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-ide.md @@ -0,0 +1,37 @@ +--- +title: "dbt Cloud IDE" +description: "Learn how to configure Git in dbt Cloud" +pagination_next: "docs/cloud/dbt-cloud-ide/develop-in-the-cloud" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + +
\ No newline at end of file diff --git a/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-tips.md b/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-tips.md new file mode 100644 index 00000000000..39db7832d79 --- /dev/null +++ b/website/docs/docs/cloud/dbt-cloud-ide/dbt-cloud-tips.md @@ -0,0 +1,64 @@ +--- +title: "Tips and tricks" +id: dbt-cloud-tips +description: "Check out any dbt Cloud and IDE-related tips." +sidebar_label: "Tips and tricks" +pagination_next: null +--- + +# dbt Cloud tips + +The Cloud IDE provides keyboard shortcuts, features, and development tips to help you work faster and be more productive. Use this Cloud IDE cheat sheet to help you quickly reference some common operations. + +## Cloud IDE Keyboard shortcuts + +There are default keyboard shortcuts that can help make development more productive and easier for everyone. + +- Press Fn-F1 to view a full list of the editor shortcuts +- Command-O on macOS or Control-O on Windows to select a file to open +- Command-P/Command-Shift-P on macOS or Control-P/Control-Shift-P on Windows to see the command palette +- Hold Option-click-on-area or press Shift-Option-Command on macOS or Hold-Alt-click-on-area on Windows to select multiple lines and perform a multi-edit. You can also press Command-E to perform this operation on the command line. +- Command-Enter on macOS or Control-Enter on Windows to Preview your code +- Command-Shift-Enter on macOS or Control-Shift-Enter on Windows to Compile +- Highlight a portion of code and use the above shortcuts to Preview or Compile code +- Enter two underscores (__) in the IDE to reveal a list of dbt functions +- Press Control-backtick (or Ctrl + `) to toggle the Invocation history +- Press Command-Option-forward slash on macOS or Control-Alt-forward slash on Windows on the selected code to add a block comment. SQL files will use the Jinja syntax `({# #})` rather than the SQL one `(/* */)`. 
Markdown files will use the Markdown syntax `(<!-- -->)` +- Option-W on macOS or Alt-W on Windows will close the currently active editor tab + + +## Package tips + +- Use the [dbt_codegen](https://hub.getdbt.com/dbt-labs/codegen/latest/) package to help you generate YML files for your models and sources and SQL files for your staging models. +- The [dbt_utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) package contains macros useful for daily development. For example, `date_spine` generates a table with all dates between the ones provided as parameters. +- The [dbt_project_evaluator](https://hub.getdbt.com/dbt-labs/dbt_project_evaluator/latest) package compares your dbt project against a list of our best practices and provides suggestions and guidelines on how to update your models. +- The [dbt_expectations](https://hub.getdbt.com/calogica/dbt_expectations/latest) package contains many tests beyond those built into dbt Core. +- The [dbt_audit_helper](https://hub.getdbt.com/#:~:text=adwords-,audit_helper,-codegen) package lets you compare the output of 2 queries. Use it when refactoring existing logic to ensure that the new results are identical. +- The [dbt_artifacts](https://hub.getdbt.com/brooklyn-data/dbt_artifacts/latest) package saves information about your dbt runs directly to your data platform so that you can track the performance of models over time. +- The [dbt_meta_testing](https://hub.getdbt.com/tnightengale/dbt_meta_testing/latest) package checks that your dbt project is sufficiently tested and documented. + +## Advanced tips + +- Use your folder structure as your primary selector method. `dbt build --select marts.marketing` is simpler and more resilient than relying on tagging every model. +- Think about jobs in terms of build cadences and SLAs. Run models that have hourly, daily, or weekly build cadences together. +- Use the [where config](/reference/resource-configs/where) for tests to test an assertion on a subset of records.
+- [store_failures](/reference/resource-configs/store_failures) lets you examine records that cause tests to fail, so you can either repair the data or change the test as needed. +- Use [severity](/reference/resource-configs/severity) thresholds to set an acceptable number of failures for a test. +- Use [incremental_strategy](/docs/build/incremental-models#about-incremental_strategy) in your incremental model config to implement the most effective behavior depending on the volume of your data and reliability of your unique keys. +- Set `vars` in your `dbt_project.yml` to define global defaults for certain conditions, which you can then override using the `--vars` flag in your commands. +- Use [for loops](/guides/advanced/using-jinja#use-a-for-loop-in-models-for-repeated-sql) in Jinja to [DRY](https://docs.getdbt.com/terms/dry) up repetitive logic, such as selecting a series of columns that all require the same transformations and naming patterns to be applied. +- Instead of relying on post-hooks, use the [grants config](/reference/resource-configs/grants) to apply permission grants in the warehouse resiliently. +- Define [source-freshness](/docs/build/sources#snapshotting-source-data-freshness) thresholds on your sources to avoid running transformations on data that has already been processed. +- Use the `+` operator on the left of a model `dbt build --select +model_name` to run a model and all of its upstream dependencies. Use the `+` operator on the right of the model `dbt build --select model_name+` to run a model and everything downstream that depends on it. +- Use `dir_name` to run all models in a package or directory. +- Use the `@` operator on the left of a model in a non-state-aware CI setup to test it. This operator runs all of a selection’s parents and children, and also runs the parents of its children, which in a fresh CI schema will likely not exist yet. 
+- Use the [--exclude flag](/reference/node-selection/exclude) to remove a subset of models out of a selection. +- Use the [--full-refresh](/reference/commands/run#refresh-incremental-models) flag to rebuild an incremental model from scratch. +- Use [seeds](/docs/build/seeds) to create manual lookup tables, like zip codes to states or marketing UTMs to campaigns. `dbt seed` will build these from CSVs into your warehouse and make them `ref` able in your models. +- Use [target.name](/docs/build/custom-schemas#an-alternative-pattern-for-generating-schema-names) to pivot logic based on what environment you’re using. For example, to build into a single development schema while developing, but use multiple schemas in production. + +## Related docs + +- [Quickstart guide](/quickstarts) +- [About dbt Cloud](/docs/cloud/about-cloud/dbt-cloud-features) +- [Develop in the Cloud](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) diff --git a/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md new file mode 100644 index 00000000000..9fc382f0217 --- /dev/null +++ b/website/docs/docs/cloud/dbt-cloud-ide/develop-in-the-cloud.md @@ -0,0 +1,173 @@ +--- +title: "About the dbt Cloud IDE" +id: develop-in-the-cloud +description: "Develop, test, run, and build in the Cloud IDE. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly" +sidebar_label: About the IDE +tags: [IDE] +pagination_next: "docs/cloud/dbt-cloud-ide/ide-user-interface" +pagination_prev: null +--- + +The dbt Cloud integrated development environment (IDE) is a single web-based interface for building, testing, running, and version-controlling dbt projects. It compiles dbt code into SQL and executes it directly on your database. 
+ +The dbt Cloud IDE offers several [editing features](/docs/cloud/dbt-cloud-ide/ide-user-interface#editing-features) for faster and more efficient data platform development and governance: + +- Syntax highlighting for SQL: Makes it easy to distinguish different parts of your code, reducing syntax errors and enhancing readability. +- Auto-completion: Suggests table names, arguments, and column names as you type, saving time and reducing typos. +- Code [formatting and linting](/docs/cloud/dbt-cloud-ide/lint-format): Help standardize and fix your SQL code effortlessly. +- Navigation tools: Easily move around your code, jump to specific lines, find and replace text, and navigate between project files. +- Version control: Manage code versions with a few clicks. + +These [features](#dbt-cloud-ide-features) create a powerful editing environment for efficient SQL coding, suitable for both experienced and beginner developers. + + + + + + + + +:::tip Disable ad blockers + +To improve your experience using dbt Cloud, we suggest that you turn off ad blockers. This is because some project file names, such as `google_adwords.sql`, might resemble ad traffic and trigger ad blockers. + +::: + +## Prerequisites + +- A [dbt Cloud account](https://cloud.getdbt.com/) and [Developer seat license](/docs/cloud/manage-access/seats-and-users) +- A git repository set up and git provider must have `write` access enabled. 
See [Connecting your GitHub Account](/docs/cloud/git/connect-github) or [Importing a project by git URL](/docs/cloud/git/import-a-project-by-git-url) for detailed setup instructions +- A dbt project connected to a [data platform](/docs/cloud/connect-data-platform/about-connections) +- A [development environment and development credentials](#access-the-cloud-ide) set up +- The environment must be on dbt version 1.0 or higher + + +## dbt Cloud IDE features + +The dbt Cloud IDE comes with [tips](/docs/cloud/dbt-cloud-ide/dbt-cloud-tips) and [features](/docs/cloud/dbt-cloud-ide/ide-user-interface) that make it easier for you to develop, build, compile, run, and test data models. + +:::tip Stay informed + +To stay informed on IDE updates, read [dbt Cloud IDE release notes](/tags/ide), or refer to the [IDE user interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) for comprehensive definitions and terminology of user interface elements. + +::: + +| Feature | Info | +|---|---| +| **Keyboard shortcuts** | You can access a variety of [commands and actions](/docs/cloud/dbt-cloud-ide/dbt-cloud-tips#cloud-ide-keyboard-shortcuts) in the IDE by choosing the appropriate keyboard shortcut. This includes the command palette (Command-P or Control-P), which has common shortcuts and build commands to optimize your developer workflow. For example, you can build modified models or continue the build from the last failure point. | +| **File state indicators** | Ability to see when changes or actions have been made to the file. The indicators **M, D, A,** and **•** appear to the right of your file or folder name and indicate the actions performed:

- Unsaved **(•)** — The IDE detects unsaved changes to your file/folder
- Modification **(M)** — The IDE detects a modification of existing files/folders
- Added **(A)** — The IDE detects added files
- Deleted **(D)** — The IDE detects deleted files. +| **IDE version control** | The IDE version control section and git button allow you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project directly in the IDE.

- Create or change branches
- Commit or revert individual files by right-clicking the edited file
- [Resolve merge conflicts](/docs/collaborate/git/merge-conflicts)
- Execute git commands using the git button
- Link to the repo directly by clicking the branch name | +| **Project documentation** | Generate and view your [project documentation](/docs/collaborate/build-and-view-your-docs) for your dbt project in real-time. You can inspect and verify what your project's documentation will look like before you deploy your changes to production. | +| **Preview and Compile button** | You can [compile or preview](/docs/cloud/dbt-cloud-ide/ide-user-interface#console-section) code, a snippet of dbt code, or one of your dbt models after editing and saving. | +| **Build, test, and run button** | Build, test, and run your project with a button click or by using the Cloud IDE command bar. +| **Command bar** | You can enter and run commands from the command bar at the bottom of the IDE. Use the [rich model selection syntax](/reference/node-selection/syntax) to execute [dbt commands](/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking History on the left of the bar. +| **Drag and drop** | Drag and drop files located in the file explorer, and use the file breadcrumb on the top of the IDE for quick, linear navigation. Access adjacent files in the same file by right-clicking on the breadcrumb file. +| **Organize tabs and files** | - Move your tabs around to reorganize your work in the IDE
- Right-click on a tab to view and select a list of actions, including duplicate files
- Close multiple, unsaved tabs to batch save your work
- Double click files to rename files | +| **Find and replace** | - Press Command-F or Control-F to open the find-and-replace bar in the upper right corner of the current file in the IDE. The IDE highlights your search results in the current file and code outline
- You can use the up and down arrows to see the match highlighted in the current file when there are multiple matches
- Use the left arrow to replace the text with something else | +| **Multiple selections** | You can make multiple selections for small and simultaneous edits. The below commands are a common way to add more cursors and allow you to insert cursors below or above with ease.

- Option-Command-Down arrow or Ctrl-Alt-Down arrow
- Option-Command-Up arrow or Ctrl-Alt-Up arrow
- Press Option and click on an area or Press Ctrl-Alt and click on an area
+| **Lint and Format** | [Lint and format](/docs/cloud/dbt-cloud-ide/lint-format) your files with a click of a button, powered by SQLFluff, sqlfmt, Prettier, and Black. +| **Git diff view** | Ability to see what has been changed in a file before you make a pull request. +| **dbt autocomplete** | New autocomplete features to help you develop faster:

- Use `ref` to autocomplete your model names
- Use `source` to autocomplete your source name + table name
- Use `macro` to autocomplete your arguments
- Use `env var` to autocomplete env var
- Start typing a hyphen (-) to use in-line autocomplete in a YAML file | +| **DAG in the IDE** | You can see how models are used as building blocks from left to right to transform your data from raw sources into cleaned-up modular derived pieces and final outputs on the far right of the DAG. The default view is 2+model+2 (defaults to display 2 nodes away); however, you can change it to +model+ (full DAG). Note the `--exclude` flag isn't supported. | +| **Status bar** | This area provides you with useful information about your IDE and project status. You also have additional options like enabling light or dark mode, restarting the IDE, or [recloning your repo](/docs/collaborate/git/version-control-basics). +| **Dark mode** | From the status bar in the Cloud IDE, enable dark mode for a great viewing experience in low-light environments. + + +### Start-up process + +There are three start-up states when using or launching the Cloud IDE: + +- **Creation start —** This is the state where you are starting the IDE for the first time. You can also view this as a *cold start* (see below), and you can expect this state to take longer because the git repository is being cloned. +- **Cold start —** This is the process of starting a new develop session, which will be available for you for three hours. The environment automatically turns off three hours after the last activity. This includes compile, preview, or any dbt invocation; however, it *does not* include editing and saving a file. +- **Hot start —** This is the state of resuming an existing or active develop session within three hours of the last activity. + +### Work retention + +The Cloud IDE needs explicit action to save your changes. There are three ways your work is stored: + +- **Unsaved, local code —** The browser stores your code only in its local storage. In this state, you might need to commit any unsaved changes in order to switch branches or browsers.
If you have saved and committed changes, you can access the "Change branch" option even if there are unsaved changes. But if you attempt to switch branches without saving changes, a warning message will appear, notifying you that you will lose any unsaved changes. + + +- **Saved but uncommitted code —** When you save a file, the data gets stored in durable, long-term storage, but isn't synced back to git. To switch branches using the **Change branch** option, you must "Commit and sync" or "Revert" changes. Changing branches isn't available for saved-but-uncommitted code. This is to ensure your uncommitted changes don't get lost. +- **Committed code —** This is stored in the branch with your git provider and you can check out other (remote) branches. + +## Access the Cloud IDE + +:::tip Disable ad blockers + +To improve your experience using dbt Cloud, we suggest that you turn off ad blockers. This is because some project file names, such as `google_adwords.sql`, might resemble ad traffic and trigger ad blockers. + +::: + +In order to start experiencing the great features of the Cloud IDE, you need to first set up a [dbt Cloud development environment](/docs/dbt-cloud-environments). In the following steps, we outline how to set up developer credentials and access the IDE. If you're creating a new project, you will automatically configure this during the project setup. + +The IDE uses developer credentials to connect to your data platform. These developer credentials should be specific to your user and they should *not* be super user credentials or the same credentials that you use for your production deployment of dbt. + +Set up your developer credentials: + +1. Navigate to your **Credentials** under **Your Profile** settings, which you can access at `https://YOUR_ACCESS_URL/settings/profile#credentials`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. +2. 
Select the relevant project in the list. +3. Click **Edit** on the bottom right of the page. +4. Enter the details under **Development Credentials**. +5. Click **Save**. + + + + +6. Access the Cloud IDE by clicking **Develop** at the top of the page. +7. Initialize your project and familiarize yourself with the IDE and its delightful [features](#dbt-cloud-ide-features). + +Nice job, you're ready to start developing and building models 🎉! + +## Build, compile, and run projects + +You can *build*, *compile*, *run*, and *test* dbt projects using the command bar or **Build** button. Use the **Build** button to quickly build, run, or test the model you're working on. The Cloud IDE will update in real time when you run models, tests, seeds, and operations. + +If a model or test fails, dbt Cloud makes it easy for you to view and download the run logs for your dbt invocations to fix the issue. + +Use dbt's [rich model selection syntax](/reference/node-selection/syntax) to [run dbt commands](/reference/dbt-commands) directly within dbt Cloud. + + + +## Build and view your project's docs + +The dbt Cloud IDE makes it possible to [build and view](/docs/collaborate/build-and-view-your-docs#generating-documentation) documentation for your dbt project while your code is still in development. With this workflow, you can inspect and verify what your project's generated documentation will look like before your changes are released to production. + + +## Related docs + +- [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features) +- [IDE user interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) +- [Version control basics](/docs/collaborate/git/version-control-basics) +- [dbt Commands](/reference/dbt-commands) +- [dbt Cloud IDE release notes](/tags/ide) + +## Related questions + +
+ +
+ Is there a cost to using the Cloud IDE? +
+
Not at all! You can use dbt Cloud when you sign up for the Free Developer plan, which comes with one developer seat. If you’d like to access more features or have more developer seats, you can upgrade your account to the Team or Enterprise plan. See dbt pricing plans for more details.
+
+
+
+ Can I be a contributor to dbt Cloud? +
+
Anyone can contribute to the dbt project. And whether it's a dbt package, a plugin, dbt-core, or this documentation site, contributing to the open-source code that supports the dbt ecosystem is a great way to level yourself up as a developer, and give back to the community. See Contributing for details on what to expect when contributing to the dbt open source software (OSS).
+
+
+
+ What is the difference between developing on the dbt Cloud IDE, the dbt Cloud CLI, and dbt Core? +
+
You can develop dbt using the web-based IDE in dbt Cloud or on the command line interface using the dbt Cloud CLI or open-source dbt Core, all of which enable you to execute dbt commands. The key distinction between the dbt Cloud CLI and dbt Core is that the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its features.

+ + dbt Cloud IDE: dbt Cloud is a web-based application that allows you to develop dbt projects with the IDE, includes a purpose-built scheduler, and provides an easier way to share your dbt documentation with your team. The IDE is a faster and more reliable way to deploy your dbt models and provides a real-time editing and execution environment for your dbt project.

+ + dbt Cloud CLI: The dbt Cloud CLI allows you to run dbt commands against your dbt Cloud development environment from your local command line or code editor. It supports cross-project ref, speedier, lower-cost builds, automatic deferral of build artifacts, and more.

+ + dbt Core: dbt Core is open-source software that’s freely available. You can build your dbt project in a code editor, and run dbt commands from the command line. +
+
+
diff --git a/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md new file mode 100644 index 00000000000..05910b23e7f --- /dev/null +++ b/website/docs/docs/cloud/dbt-cloud-ide/ide-user-interface.md @@ -0,0 +1,180 @@ +--- +title: "IDE user interface" +id: ide-user-interface +description: "Develop, test, run, and build in the Cloud IDE. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly" +sidebar_label: User interface +tags: [IDE] +--- + +The [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) is a tool for developers to effortlessly build, test, run, and version-control their dbt projects, and enhance data governance — all from the convenience of your browser. Use the Cloud IDE to compile dbt code into SQL and run it against your database directly -- no command line required! + +This page offers comprehensive definitions and terminology of user interface elements, allowing you to navigate the IDE landscape with ease. + + + +## Basic layout + +The IDE streamlines your workflow, and features a popular user interface layout with files and folders on the left, editor on the right, and command and console information at the bottom. + + + +1. **Git repository link —** Clicking the Git repository link, located on the upper left of the IDE, takes you to your repository on the same active branch. + * **Note:** This feature is only available for GitHub or GitLab repositories on multi-tenant dbt Cloud accounts. + +2. **Documentation site button —** Clicking the Documentation site book icon, located next to the Git repository link, leads to the dbt Documentation site. The site is powered by the latest dbt artifacts generated in the IDE using the `dbt docs generate` command from the Command bar. + +3. 
[**Version Control**](#editing-features) — The IDE's powerful Version Control section contains all git-related elements, including the Git actions button and the **Changes** section. + +4. **File Explorer —** The File Explorer shows the filetree of your repository. You can: + - Click on any file in the filetree to open the file in the File Editor. + - Click and drag files between directories to move files. + - Right-click a file to access the sub-menu options like duplicate file, copy file name, copy as `ref`, rename, delete. + - **Note**: To perform these actions, the user must not be in `read-only` mode, which generally happens when the user is viewing the default branch. + - Use file indicators, located to the right of your files or folder name, to see when changes or actions were made: + * Unsaved (•) — The IDE detects unsaved changes to your file/folder + * Modification (M) — The IDE detects a modification of existing files/folders + * Added (A) — The IDE detects added files + * Deleted (D) — The IDE detects deleted files. + + + +5. **Command bar —** The Command bar, located in the lower left of the IDE, is used to invoke [dbt commands](/reference/dbt-commands). When a command is invoked, the associated logs are shown in the Invocation History Drawer. + +6. **Defer to production —** The **Defer to production** toggle allows developers to only build and run and test models they've edited without having to first run and build all the models that come before them (upstream parents). Refer to [Using defer in dbt Cloud](/docs/cloud/about-cloud-develop-defer#defer-in-the-dbt-cloud-ide) for more info. + +7. **Status button —** The IDE Status button, located on the lower right of the IDE, displays the current IDE status. If there is an error in the status or in the dbt code that stops the project from parsing, the button will turn red and display "Error". If there aren't any errors, the button will display a green "Ready" status. 
To access the [IDE Status modal](#modals-and-menus), simply click on this button. + + +## Editing features + +The IDE features some delightful tools and layouts to make it easier for you to write dbt code and collaborate with teammates. + + + +1. **File Editor —** The File Editor is where users edit code. Tabs break out the region for each opened file, and unsaved files are marked with a blue dot icon in the tab view. + + * Use intuitive [keyboard shortcuts](/docs/cloud/dbt-cloud-ide/dbt-cloud-tips#cloud-ide-keyboard-shortcuts) to help develop easier for you and your team. + +2. **Save button —** The editor has a **Save** button that saves editable files. Pressing the button or using the Command-S or Control-S shortcut saves the file contents. You don't need to save to preview code results in the Console section, but it's necessary before changes appear in a dbt invocation. The File Editor tab shows a blue icon for unsaved changes. + +3. **Version Control —** This menu contains all git-related elements, including the Git actions button. The button updates relevant actions based on your editor's state, such as prompting to pull remote changes, commit and sync when reverted commit changes are present, or creating a merge/pull request when appropriate. The dropdown menu on the Git actions button allows users to revert changes, refresh git state, create merge/pull requests, and change branches. You can also [resolve merge conflicts](/docs/collaborate/git/merge-conflicts) and for more info on git, refer to [Version control basics](/docs/collaborate/git/version-control-basics#the-git-button-in-the-cloud-ide). + + - **Version Control Options menu —** Below the Git Actions button, the **Changes** section, which lists all file changes since the last commit. You can click on a change to open the Git Diff View to see the inline changes. You can also right-click any file and use the file-specific options in the Version Control Options menu. 
+ + + +## Additional editing features + +- **Minimap —** A Minimap (code outline) gives you a high-level overview of your source code, which is useful for quick navigation and code understanding. A file's minimap is displayed on the upper-right side of the editor. To quickly jump to different sections of your file, click the shaded area. + + +- **dbt Editor Command Palette —** The dbt Editor Command Palette displays text editing actions and their associated keyboard shortcuts. This can be accessed by pressing `F1` or right-clicking in the text editing area and selecting Command Palette. + + +- **Git Diff View —** Clicking on a file in the **Changes** section of the **Version Control Menu** will open the changed file with Git Diff view. The editor will show the previous version on the left and the in-line changes made on the right. + + +- **Markdown Preview console tab —** The Markdown Preview console tab shows a preview of your .md file's markdown code in your repository and updates it automatically as you edit your code. + + +- **CSV Preview console tab —** The CSV Preview console tab displays the data from your CSV file in a table, which updates automatically as you edit the file in your seed directory. + + +## Console section + +The console section, located below the File editor, includes various console tabs and buttons to help you with tasks such as previewing, compiling, building, and viewing the DAG. Refer to the following sub-bullets for more details on the console tabs and buttons. + + +1. **Preview button —** When you click on the Preview button, it runs the SQL in the active file editor regardless of whether you have saved it or not and sends the results to the **Results** console tab. You can preview a selected portion of saved or unsaved code by highlighting it and then clicking the **Preview** button. + +
+Row limits in IDE +The dbt Cloud IDE applies default row limits; however, you can also specify the number of records returned. Refer to the following sub-bullets for more info:

+
    +
  • 500-row limit: To prevent the IDE from returning too much data and causing browser problems, dbt automatically sets a 500-row limit when using the Preview Button. You can modify this by adding limit your_number at the end of your SQL statement. For example, SELECT * FROM table limit 100 will return up to 100 rows. Remember that you must write the limit your_number explicitly and cannot derive it from a macro.
  • +
  • Change row limit default: In dbt version 1.6 or higher, you have the ability to change the default limit of 500 rows shown in the Results tab when you run a query. To adjust the setting you can click on Change row display next to the displayed rows. Keep in mind that you can't set it higher than 10,000 rows. If you refresh the page or close your development session, the default limit will go back to 500 rows.
  • +
  • Specify records returned: The IDE also supports SELECT TOP #, which specifies the number of records to return.
  • +
+
+ +2. **Compile button —** The **Compile** button compiles the saved or unsaved SQL code and displays it in the **Compiled Code** tab. + + +Starting from dbt v1.6 or higher, when you save changes to a model, you can compile its code with the model's specific context. This context is similar to what you'd have when building the model and involves useful context variables like `{{ this }} `or `{{ is_incremental() }}`. + +3. **Build button —** The build button allows users to quickly access dbt commands related to the active model in the File Editor. The available commands include dbt build, dbt test, and dbt run, with options to include only the current resource, the resource and its upstream dependencies, the resource, and its downstream dependencies, or the resource with all dependencies. This menu is available for all executable nodes. + +3. **Format button —** The editor has a **Format** button that can reformat the contents of your files. For SQL files, it uses either `sqlfmt` or `sqlfluff`, and for Python files, it uses `black`. + +5. **Results tab —** The Results console tab displays the most recent Preview results in tabular format. + + +6. **Compiled Code tab —** The Compile button triggers a compile invocation that generates compiled code, which is displayed in the Compiled Code tab. + + +7. **Lineage tab —** The Lineage tab in the File Editor displays the active model's lineage or . By default, it shows two degrees of lineage in both directions (`2+model_name+2`), however, you can change it to +model+ (full DAG). + - Double-click a node in the DAG to open that file in a new tab + - Expand or shrink the DAG using node selection syntax. + - Note, the `--exclude` flag isn't supported. + + + +## Invocation history + +The Invocation History Drawer stores information on dbt invocations in the IDE. When you invoke a command, like executing a dbt command such as `dbt run`, the associated logs are displayed in the Invocation History Drawer. 
+ +You can open the drawer in multiple ways: +- Clicking the `^` icon next to the Command bar on the lower left of the page +- Typing a dbt command and pressing enter +- Or pressing Control-backtick (or Ctrl + `) + + + +1. **Invocation History list —** The left-hand panel of the Invocation History Drawer displays a list of previous invocations in the IDE, including the command, branch name, command status, and elapsed time. + +2. **Invocation Summary —** The Invocation Summary, located above **System Logs**, displays information about a selected command from the Invocation History list, such as the command, its status (`Running` if it's still running), the git branch that was active during the command, and the time the command was invoked. + +3. **System Logs toggle —** The System Logs toggle, located under the Invocation Summary, allows the user to see the full stdout and debug logs for the entirety of the invoked command. + +4. **Command Control button —** Use the Command Control button, located on the right side, to control your invocation and cancel or rerun a selected run. + + + +5. **Node Summary tab —** Clicking on the Results Status Tabs will filter the Node Status List based on their corresponding status. The available statuses are Pass (successful invocation of a node), Warn (test executed with a warning), Error (database error or test failure), Skip (nodes not run due to upstream error), and Queued (nodes that have not executed yet). + +6. **Node result toggle —** After running a dbt command, information about each executed node can be found in a Node Result toggle, which includes a summary and debug logs. The Node Results List lists every node that was invoked during the command. + +7. **Node result list —** The Node result list shows all the Node Results used in the dbt run, and you can filter it by clicking on a Result Status tab. 
+ + +## Modals and Menus +Use menus and modals to interact with IDE and access useful options to help your development workflow. + +- **Editor tab menu —** To interact with open editor tabs, right-click any tab to access the helpful options in the file tab menu. + +- **File Search —** You can easily search for and navigate between files using the File Navigation menu, which can be accessed by pressing Command-O or Control-O or clicking on the 🔍 icon in the File Explorer. + + +- **Global Command Palette—** The Global Command Palette provides helpful shortcuts to interact with the IDE, such as git actions, specialized dbt commands, and compile, and preview actions, among others. To open the menu, use Command-P or Control-P. + + +- **IDE Status modal —** The IDE Status modal shows the current error message and debug logs for the server. This also contains an option to restart the IDE. Open this by clicking on the IDE Status button. + + +- **Commit Changes modal —** The Commit Changes modal is accessible via the Git Actions button to commit all changes or via the Version Control Options menu to commit individual changes. Once you enter a commit message, you can use the modal to commit and sync the selected changes. + + +- **Change Branch modal —** The Change Branch modal allows users to switch git branches in the IDE. It can be accessed through the `Change Branch` link or the Git Actions button in the Version Control menu. + + +- **Revert Uncommitted Changes modal —** The Revert Uncommitted Changes modal is how users revert changes in the IDE. This is accessible via the `Revert File` option above the Version Control Options menu, or via the Git Actions button when there are saved, uncommitted changes in the IDE. + + +- **IDE Options menu —** The IDE Options menu can be accessed by clicking on the three-dot menu located at the bottom right corner of the IDE. 
This menu contains global options such as: + + * Toggling between dark or light mode for a better viewing experience + * Restarting the IDE + * Fully recloning your repository to refresh your git state and view status details + * Viewing status details, including the IDE Status modal. + + diff --git a/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md b/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md new file mode 100644 index 00000000000..6a86f1aa14b --- /dev/null +++ b/website/docs/docs/cloud/dbt-cloud-ide/lint-format.md @@ -0,0 +1,239 @@ +--- +title: "Lint and format your code" +id: "lint-format" +description: "Integrate with popular linters and formatters like SQLFluff, sqlfmt, Black, and Prettier." +sidebar_label: "Lint and format" +tags: [IDE] +--- + +Enhance your development workflow by integrating with popular linters and formatters like [SQLFluff](https://sqlfluff.com/), [sqlfmt](http://sqlfmt.com/), [Black](https://black.readthedocs.io/en/latest/), and [Prettier](https://prettier.io/). Leverage these powerful tools directly in the dbt Cloud IDE without interrupting your development flow. + +
+What are linters and formatters? +Linters analyze code for errors, bugs, and style issues, while formatters automatically apply style and formatting rules. Read more about when to use linters or formatters in the FAQs. +
+ + +In the dbt Cloud IDE, you have the capability to perform linting, auto-fix, and formatting on five different file types: + +- SQL — [Lint](#lint) and fix with SQLFluff, and [format](#format) with sqlfmt +- YAML, Markdown, and JSON — Format with Prettier +- Python — Format with Black + +Each file type has its own unique linting and formatting rules. You can [customize](#customize-linting) the linting process to add more flexibility and enhance problem and style detection. + +By default, the IDE uses sqlfmt rules to format your code, making it convenient to use right away. However, if you have a file named `.sqlfluff` in the root directory of your dbt project, the IDE will default to SQLFluff rules instead. + + + + + + + + + + + + + + + +## Lint + +With the dbt Cloud IDE, you can seamlessly use [SQLFluff](https://sqlfluff.com/), a configurable SQL linter, to warn you of complex functions, syntax, formatting, and compilation errors. This integration allows you to run checks, fix, and display any code errors directly within the Cloud IDE: + +- Works with Jinja and SQL, +- Comes with built-in [linting rules](https://docs.sqlfluff.com/en/stable/rules.html). You can also [customize](#customize-linting) your own linting rules. +- Empowers you to [enable linting](#enable-linting) with options like **Lint** (displays linting errors and recommends actions) or **Fix** (auto-fixes errors in the IDE). +- Displays a **Code Quality** tab to view code errors, and provides code quality visibility and management. + +:::info Ephemeral models not supported +Linting doesn't support ephemeral models in dbt v1.5 and lower. Refer to the [FAQs](#faqs) for more info. +::: + +### Enable linting + +1. To enable linting, make sure you're on a development branch. Linting isn't available on main or read-only branches. +2. Open a `.sql` file and click the **Code Quality** tab. +3. 
Click on the **` Config`** button on the bottom right side of the [console section](/docs/cloud/dbt-cloud-ide/ide-user-interface#console-section), below the **File editor**. +4. In the code quality tool config pop-up, you have the option to select **sqlfluff** or **sqlfmt**. +5. To lint your code, select the **sqlfluff** radio button. (Use sqlfmt to [format](#format) your code) +6. Once you've selected the **sqlfluff** radio button, go back to the console section (below the **File editor**) to select the **Lint** or **Fix** dropdown button: + - **Lint** button — Displays linting issues in the IDE as wavy underlines in the **File editor**. You can hover over an underlined issue to display the details and actions, including a **Quick Fix** option to fix all or specific issues. After linting, you'll see a message confirming the outcome. Linting doesn't rerun after saving. Click **Lint** again to rerun linting. + - **Fix** button — Automatically fixes linting errors in the **File editor**. When fixing is complete, you'll see a message confirming the outcome. + - Use the **Code Quality** tab to view and debug any code errors. + + + +### Customize linting + +SQLFluff is a configurable SQL linter, which means you can configure your own linting rules instead of using the default linting settings in the IDE. You can exclude files and directories by using a standard `.sqlfluffignore` file. Learn more about the syntax in the [.sqlfluffignore syntax docs](https://docs.sqlfluff.com/en/stable/configuration.html#id2). + +To configure your own linting rules: + +1. Create a new file in the root project directory (the parent or top-level directory for your files). Note: The root project directory is the directory where your `dbt_project.yml` file resides. +2. Name the file `.sqlfluff` (make sure you add the `.` before `sqlfluff`). +3. [Create](https://docs.sqlfluff.com/en/stable/configuration.html#new-project-configuration) and add your custom config code. +4. 
Save and commit your changes. +5. Restart the IDE. +6. Test it out and happy linting! + +:::tip Configure dbtonic linting rules + +Refer to the [SQLFluff config file](https://github.com/dbt-labs/jaffle-shop-template/blob/main/.sqlfluff) to add the dbt code (or dbtonic) rules we use for our own projects: + +
+dbtonic config code example provided by dbt Labs + +``` +[sqlfluff] +templater = dbt +# This change (from jinja to dbt templater) will make linting slower +# because linting will first compile dbt code into data warehouse code. +runaway_limit = 10 +max_line_length = 80 +indent_unit = space + +[sqlfluff:indentation] +tab_space_size = 4 + +[sqlfluff:layout:type:comma] +spacing_before = touch +line_position = trailing + +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = lower + +[sqlfluff:rules:aliasing.table] +aliasing = explicit + +[sqlfluff:rules:aliasing.column] +aliasing = explicit + +[sqlfluff:rules:aliasing.expression] +allow_scalar = False + +[sqlfluff:rules:capitalisation.identifiers] +extended_capitalisation_policy = lower + +[sqlfluff:rules:capitalisation.functions] +capitalisation_policy = lower + +[sqlfluff:rules:capitalisation.literals] +capitalisation_policy = lower + +[sqlfluff:rules:ambiguous.column_references] # Number in group by +group_by_and_order_by_style = implicit +``` +
+ +For more info on styling best practices, refer to [How we style our SQL](/guides/best-practices/how-we-style/2-how-we-style-our-sql). +::: + + + +## Format + +In the dbt Cloud IDE, you can format your code to match style guides with a click of a button. The IDE integrates with formatters like sqlfmt, Prettier, and Black to automatically format code on five different file types — SQL, YAML, Markdown, Python, and JSON: + +- SQL — Format with [sqlfmt](http://sqlfmt.com/), which provides one way to format your dbt SQL and Jinja. +- YAML, Markdown, and JSON — Format with [Prettier](https://prettier.io/). +- Python — Format with [Black](https://black.readthedocs.io/en/latest/). + +The Cloud IDE formatting integrations take care of manual tasks like code formatting, enabling you to focus on creating quality data models, collaborating, and driving impactful results. + +### Format SQL + +To format your SQL code, dbt Cloud integrates with [sqlfmt](http://sqlfmt.com/), which is an uncompromising SQL query formatter that provides one way to format the SQL query and Jinja. + +By default, the IDE uses sqlfmt rules to format your code, making the **Format** button available and convenient to use right away. However, if you have a file named .sqlfluff in the root directory of your dbt project, the IDE will default to SQLFluff rules instead. + +To enable sqlfmt: + +1. Make sure you're on a development branch. Formatting isn't available on main or read-only branches. +2. Open a `.sql` file and click on the **Code Quality** tab. +3. Click on the **` Config`** button on the right side of the console. +4. In the code quality tool config pop-up, you have the option to select sqlfluff or sqlfmt. +5. To format your code, select the **sqlfmt** radio button. (Use sqlfluff to [lint](#lint) your code). +6. Once you've selected the **sqlfmt** radio button, go to the console section (located below the **File editor**) to select the **Format** button. +7. 
The **Format** button auto-formats your code in the **File editor**. Once you've auto-formatted, you'll see a message confirming the outcome. + + + +### Format YAML, Markdown, JSON + +To format your YAML, Markdown, or JSON code, dbt Cloud integrates with [Prettier](https://prettier.io/), which is an opinionated code formatter. + +1. To enable formatting, make sure you're on a development branch. Formatting isn't available on main or read-only branches. +2. Open a `.yml`, `.md`, or `.json` file. +3. In the console section (located below the **File editor**), select the **Format** button to auto-format your code in the **File editor**. Use the **Code Quality** tab to view code errors. +4. Once you've auto-formatted, you'll see a message confirming the outcome. + + + + +You can add a configuration file to customize formatting rules for YAML, Markdown, or JSON files using Prettier. The IDE looks for the configuration file based on an order of precedence. For example, it first checks for a "prettier" key in your `package.json` file. + +For more info on the order of precedence and how to configure files, refer to [Prettier's documentation](https://prettier.io/docs/en/configuration.html). Please note, `.prettierrc.json5`, `.prettierrc.js`, and `.prettierrc.toml` files aren't currently supported. + +### Format Python + +To format your Python code, dbt Cloud integrates with [Black](https://black.readthedocs.io/en/latest/), which is an uncompromising Python code formatter. + +1. To enable formatting, make sure you're on a development branch. Formatting isn't available on main or read-only branches. +2. Open a `.py` file. +3. In the console section (located below the **File editor**), select the **Format** button to auto-format your code in the **File editor**. +4. Once you've auto-formatted, you'll see a message confirming the outcome. + + + +## FAQs + +
+When should I use SQLFluff and when should I use sqlfmt? + +SQLFluff and sqlfmt are both tools used for formatting SQL code, but there are some differences that may make one preferable to the other depending on your use case.
+ +SQLFluff is a SQL code linter and formatter. This means that it analyzes your code to identify potential issues and bugs, and to check that it follows coding standards. It also formats your code according to a set of rules, which are [customizable](#customize-linting), to ensure consistent coding practices. You can also use SQLFluff to keep your SQL code well-formatted and follow styling best practices.
+ +sqlfmt is a SQL code formatter. This means it automatically formats your SQL code according to a set of formatting rules that aren't customizable. It focuses solely on the appearance and layout of the code, which helps ensure consistent indentation, line breaks, and spacing. sqlfmt doesn't analyze your code for errors or bugs and doesn't look at coding issues beyond code formatting.
+ +You can use either SQLFluff or sqlfmt depending on your preference and what works best for you: + +- Use SQLFluff to have your code linted and formatted (meaning analyze and fix your code for errors/bugs, and format your styling). It allows you the flexibility to customize your own rules. + +- Use sqlfmt to only have your code well-formatted without analyzing it for errors and bugs. You can use sqlfmt out of the box, making it convenient to use right away without having to configure it. + +
+ +
+Can I nest .sqlfluff files? + +To ensure optimal code quality, consistent code, and styles — it's highly recommended you have one main `.sqlfluff` configuration file in the root folder of your project. Having multiple files can result in various different SQL styles in your project.

+ +However, you can customize and include an additional child `.sqlfluff` configuration file within specific subfolders of your dbt project.

By nesting a `.sqlfluff` file in a subfolder, SQLFluff will apply the rules defined in that subfolder's configuration file to any files located within it. The rules specified in the parent `.sqlfluff` file will be used for all other files and folders outside of the subfolder. This hierarchical approach allows for tailored linting rules while maintaining consistency throughout your project. Refer to [SQLFluff documentation](https://docs.sqlfluff.com/en/stable/configuration.html#configuration-files) for more info. + +
+ +
+Can I run SQLFluff commands from the terminal? + +Currently, running SQLFluff commands from the terminal isn't supported. +
+ +
+Why am I unable to see the Lint or Format button? + +Make sure you're on a development branch. Formatting or Linting isn't available on "main" or "read-only" branches. +
+ +
+Why is there inconsistent SQLFluff behavior when running outside the dbt Cloud IDE (such as a GitHub Action)? +— Double-check your SQLFluff version matches the one in dbt Cloud IDE (found in the Code Quality tab after a lint operation).

+— If your lint operation passes despite clear rule violations, confirm you're not linting models with ephemeral models. Linting doesn't support ephemeral models in dbt v1.5 and lower. +
+ +## Related docs + +- [User interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) +- [Tips and tricks](/docs/cloud/dbt-cloud-ide/dbt-cloud-tips) diff --git a/website/docs/docs/collaborate/git/authenticate-azure.md b/website/docs/docs/cloud/git/authenticate-azure.md similarity index 73% rename from website/docs/docs/collaborate/git/authenticate-azure.md rename to website/docs/docs/cloud/git/authenticate-azure.md index abac4fd1b59..42028bf993b 100644 --- a/website/docs/docs/collaborate/git/authenticate-azure.md +++ b/website/docs/docs/cloud/git/authenticate-azure.md @@ -3,10 +3,11 @@ title: "Authenticate with Azure DevOps" id: "authenticate-azure" description: "dbt Cloud developers need to authenticate with Azure DevOps." sidebar_label: "Authenticate with Azure DevOps" +pagination_next: null --- -If you use the dbt Cloud IDE to collaborate on your team's Azure DevOps dbt repo, you need to [link your dbt Cloud profile to Azure DevOps](#link-your-dbt-cloud-profile-to-azure-devops), which provides an extra layer of authentication. +If you use the dbt Cloud IDE or dbt Cloud CLI to collaborate on your team's Azure DevOps dbt repo, you need to [link your dbt Cloud profile to Azure DevOps](#link-your-dbt-cloud-profile-to-azure-devops), which provides an extra layer of authentication. ## Link your dbt Cloud profile to Azure DevOps @@ -22,3 +23,8 @@ Connect your dbt Cloud profile to Azure DevOps using OAuth: You will be directed back to dbt Cloud, and your profile should be linked. You are now ready to develop in dbt Cloud! 
+ +## FAQs + + + diff --git a/website/docs/docs/collaborate/git/connect-azure-devops.md b/website/docs/docs/cloud/git/connect-azure-devops.md similarity index 57% rename from website/docs/docs/collaborate/git/connect-azure-devops.md rename to website/docs/docs/cloud/git/connect-azure-devops.md index 22ecd12bbbf..c138e042abc 100644 --- a/website/docs/docs/collaborate/git/connect-azure-devops.md +++ b/website/docs/docs/cloud/git/connect-azure-devops.md @@ -1,9 +1,10 @@ --- title: "Connect to Azure DevOps" id: "connect-azure-devops" +pagination_next: "docs/cloud/git/setup-azure" --- - + ## About Azure DevOps and dbt Cloud @@ -13,13 +14,14 @@ Connect your Azure DevOps cloud account in dbt Cloud to unlock new product exper - Import new Azure DevOps repos with a couple clicks during dbt Cloud project setup. - Clone repos using HTTPS rather than SSH - Enforce user authorization with OAuth 2.0. -- Carry Azure DevOps user repository permissions (read / write access) through to dbt Cloud IDE's git actions. +- Carry Azure DevOps user repository permissions (read / write access) through to dbt Cloud IDE or dbt Cloud CLI's git actions. - Trigger Continuous integration (CI) builds when pull requests are opened in Azure DevOps. To connect Azure DevOps in dbt Cloud: -1. An account admin needs to [set up an Active Directory application and add it to dbt Cloud](/docs/collaborate/git/setup-azure). -2. dbt Cloud developers need to [personally authenticate with Azure DevOps](/docs/collaborate/git/authenticate-azure) from dbt Cloud. +1. An account admin needs to [set up an Active Directory application and add it to dbt Cloud](/docs/cloud/git/setup-azure). +2. dbt Cloud developers need to [personally authenticate with Azure DevOps](/docs/cloud/git/authenticate-azure) from dbt Cloud. 
+If you're a Business Critical customer using [IP restrictions](/docs/cloud/secure/ip-restrictions), ensure you've added the appropriate Azure DevOps CIDRs to your IP restriction rules, or else the Azure DevOps connection will fail. diff --git a/website/docs/docs/cloud/git/connect-github.md b/website/docs/docs/cloud/git/connect-github.md new file mode 100644 index 00000000000..ff0f2fff18f --- /dev/null +++ b/website/docs/docs/cloud/git/connect-github.md @@ -0,0 +1,82 @@ +--- +title: "Connect to GitHub" +description: "Learn how connecting your GitHub account provides convenience and another layer of security to dbt Cloud." +id: "connect-github" +sidebar_label: "Connect to GitHub" +--- + + +Connecting your GitHub account to dbt Cloud provides convenience and another layer of security to dbt Cloud: +- Log into dbt Cloud using OAuth through GitHub. +- Import new GitHub repositories with a couple clicks during dbt Cloud project setup. +- Clone repos using HTTPS rather than SSH. +- Trigger [Continuous integration](/docs/deploy/continuous-integration)(CI) builds when pull requests are opened in GitHub. + +## Prerequisites + +- For On-Premises GitHub deployment, reference [importing a project by git URL](/docs/cloud/git/import-a-project-by-git-url) to set up your connection instead. Some git features are [limited](/docs/cloud/git/import-a-project-by-git-url#limited-integration) with this setup. + * **Note** — [Single tenant](/docs/cloud/about-cloud/tenancy#single-tenant) accounts offer enhanced connection options for integrating with an On-Premises GitHub deployment setup using the native integration. This integration allows you to use all the features of the integration, such as triggering CI builds. The dbt Labs infrastructure team will coordinate with you to ensure any additional networking configuration requirements are met and completed. To discuss details, contact dbt Labs support or your dbt Cloud account team. 
+- You _must_ be a **GitHub organization owner** in order to [install the dbt Cloud application](/docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) in your GitHub organization. To learn about GitHub organization roles, see the [GitHub documentation](https://docs.github.com/en/organizations/managing-peoples-access-to-your-organization-with-roles/roles-in-an-organization). +- The GitHub organization owner requires [_Owner_](/docs/cloud/manage-access/self-service-permissions) or [_Account Admin_](/docs/cloud/manage-access/enterprise-permissions) permissions when they log into dbt Cloud to integrate with a GitHub environment using organizations. +- You may need to temporarily provide an extra dbt Cloud user account with _Owner_ or _Account Admin_ [permissions](/docs/cloud/manage-access/self-service-permissions) for your GitHub organization owner until they complete the installation. + + +## Installing dbt Cloud in your GitHub account + +You can connect your dbt Cloud account to GitHub by installing the dbt Cloud application in your GitHub organization and providing access to the appropriate repositories. +To connect your dbt Cloud account to your GitHub account: + +1. Navigate to **Your Profile** settings by clicking the gear icon in the top right. + +2. Select **Linked Accounts** from the left menu. + + + +3. In the **Linked Accounts** section, set up your GitHub account connection to dbt Cloud by clicking **Link** to the right of GitHub. This redirects you to your account on GitHub where you will be asked to install and configure the dbt Cloud application. + +4. Select the GitHub organization and repositories dbt Cloud should access. + + + +5. 
Assign the dbt Cloud GitHub App the following permissions: + - Read access to metadata + - Read and write access to Checks + - Read and write access to Commit statuses + - Read and write access to Contents (Code) + - Read and write access to Pull requests + - Read and write access to Webhooks + - Read and write access to Workflows + +6. Once you grant access to the app, you will be redirected back to dbt Cloud and shown a linked account success state. You are now personally authenticated. +7. Ask your team members to [personally authenticate](/docs/cloud/git/connect-github#personally-authenticate-with-github) by connecting their GitHub profiles. + +## Limiting repository access in GitHub +If you are your GitHub organization owner, you can also configure the dbt Cloud GitHub application to have access to only select repositories. This configuration must be done in GitHub, but we provide an easy link in dbt Cloud to start this process. + + +## Personally authenticate with GitHub + +Once the dbt Cloud admin has [set up a connection](/docs/cloud/git/connect-github#installing-dbt-cloud-in-your-github-account) to your organization GitHub account, you need to personally authenticate, which improves the security of dbt Cloud by enabling you to log in using OAuth through GitHub. +:::infoGitHub profile connection +- dbt Cloud developers on the [Enterprise plan](https://www.getdbt.com/pricing/) must each connect their GitHub profiles to dbt Cloud. This is because the dbt Cloud IDE verifies every developer's read / write access for the dbt repo. + +- dbt Cloud developers on the [Team plan](https://www.getdbt.com/pricing/) don't need to each connect their profiles to GitHub, however, it's still recommended to do so. +::: + +To connect a personal GitHub account: + +1. Navigate to **Your Profile** settings by clicking the gear icon in the top right. + +2. Select **Linked Accounts** in the left menu. If your GitHub account is not connected, you’ll see "No connected account". + +3. 
Select **Link** to begin the setup process. You’ll be redirected to GitHub, and asked to authorize dbt Cloud in a grant screen. + + +4. Once you approve authorization, you will be redirected to dbt Cloud, and you should now see your connected account. + +The next time you log into dbt Cloud, you will be able to do so via OAuth through GitHub, and if you're on the Enterprise plan, you're ready to use the dbt Cloud IDE or dbt Cloud CLI. + + +## FAQs + + diff --git a/website/docs/docs/collaborate/git/connect-gitlab.md b/website/docs/docs/cloud/git/connect-gitlab.md similarity index 85% rename from website/docs/docs/collaborate/git/connect-gitlab.md rename to website/docs/docs/cloud/git/connect-gitlab.md index 3ff5d6f4e56..e55552e2d86 100644 --- a/website/docs/docs/collaborate/git/connect-gitlab.md +++ b/website/docs/docs/cloud/git/connect-gitlab.md @@ -1,15 +1,15 @@ --- title: "Connect to GitLab" +description: "Learn how connecting your GitLab account provides convenience and another layer of security to dbt Cloud." id: "connect-gitlab" --- -## Overview Connecting your GitLab account to dbt Cloud provides convenience and another layer of security to dbt Cloud: - Import new GitLab repos with a couple clicks during dbt Cloud project setup. - Clone repos using HTTPS rather than SSH. -- Carry GitLab user permissions through to dbt Cloud IDE's git actions. -- Trigger [Continuous integration](/docs/deploy/cloud-ci-job) builds when merge requests are opened in GitLab. +- Carry GitLab user permissions through to dbt Cloud or dbt Cloud CLI's git actions. +- Trigger [Continuous integration](/docs/deploy/continuous-integration) builds when merge requests are opened in GitLab. The steps to integrate GitLab in dbt Cloud depend on your plan. If you are on: - the Developer or Team plan, read these [instructions](#for-dbt-cloud-developer-and-team-tiers). 
@@ -35,7 +35,7 @@ Once you've accepted, you should be redirected back to dbt Cloud, and you'll see dbt Cloud enterprise customers have the added benefit of bringing their own GitLab OAuth application to dbt Cloud. This tier benefits from extra security, as dbt Cloud will: - Enforce user authorization with OAuth. -- Carry GitLab's user repository permissions (read / write access) through to dbt Cloud IDE's git actions. +- Carry GitLab's user repository permissions (read / write access) through to dbt Cloud or dbt Cloud CLI's git actions. In order to connect GitLab in dbt Cloud, a GitLab account admin must: 1. [Set up a GitLab OAuth application](#setting-up-a-gitlab-oauth-application). @@ -59,12 +59,11 @@ In GitLab, when creating your Group Application, input the following: | Field | Value | | ------ | ----- | | **Name** | dbt Cloud | -| **Redirect URI** | https://cloud.getdbt.com/complete/gitlab | +| **Redirect URI** | `https://YOUR_ACCESS_URL/complete/gitlab` | | **Confidential** | ✔️ | | **Scopes** | ✔️ api | -dbt Cloud single tenant customers need to replace **cloud.getdbt.com** with the hostname of -your dbt Cloud instance. +Replace `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. The application form in GitLab should look as follows when completed: @@ -72,6 +71,8 @@ The application form in GitLab should look as follows when completed: Click **Save application** in GitLab, and GitLab will then generate an **Application ID** and **Secret**. These values will be available even if you close the app screen, so this is not the only chance you have to save them. +If you're a Business Critical customer using [IP restrictions](/docs/cloud/secure/ip-restrictions), ensure you've added the appropriate GitLab CIDRs to your IP restriction rules, or else the GitLab connection will fail. 
+ ### Adding the GitLab OAuth application to dbt Cloud After you've created your GitLab application, you need to provide dbt Cloud information about the app. In dbt Cloud, account admins should navigate to **Account Settings**, click on the **Integrations** tab, and expand the GitLab section. @@ -96,7 +97,7 @@ You will then be redirected to GitLab and prompted to sign into your account. Gi Once you've accepted, you should be redirected back to dbt Cloud, and your integration is ready for developers on your team to [personally authenticate with](#personally-authenticating-with-gitlab). ### Personally authenticating with GitLab -dbt Cloud developers on the Enterprise plan must each connect their GitLab profiles to dbt Cloud, as every developer's read / write access for the dbt repo is checked in the dbt Cloud IDE. +dbt Cloud developers on the Enterprise plan must each connect their GitLab profiles to dbt Cloud, as every developer's read / write access for the dbt repo is checked in the dbt Cloud IDE or dbt Cloud CLI. To connect a personal GitLab account, dbt Cloud developers should navigate to Your Profile settings by clicking the gear icon in the top right, then select **Linked Accounts** in the left menu. @@ -104,7 +105,7 @@ If your GitLab account is not connected, you’ll see "No connected account". Se -Once you approve authorization, you will be redirected to dbt Cloud, and you should see your connected account. You're now ready to start developing in the dbt Cloud IDE. +Once you approve authorization, you will be redirected to dbt Cloud, and you should see your connected account. You're now ready to start developing in the dbt Cloud IDE or dbt Cloud CLI. ## Troubleshooting @@ -117,3 +118,10 @@ If you do see your repository listed, but are unable to import the repository su - You are a maintainer of that repository. Only users with maintainer permissions can set up repository connections. 
If you imported a repository using the dbt Cloud native integration with GitLab, you should be able to see the clone strategy is using a `deploy_token`. If it's relying on an SSH key, this means the repository was not set up using the native GitLab integration, but rather using the generic git clone option. The repository must be reconnected in order to get the benefits described above. + +## FAQs + + + + + diff --git a/website/docs/docs/cloud/git/git-configuration-in-dbt-cloud.md b/website/docs/docs/cloud/git/git-configuration-in-dbt-cloud.md new file mode 100644 index 00000000000..fb8c0186236 --- /dev/null +++ b/website/docs/docs/cloud/git/git-configuration-in-dbt-cloud.md @@ -0,0 +1,37 @@ +--- +title: "Git configuration in dbt Cloud" +description: "Learn about the Git providers supported in dbt Cloud" +pagination_next: "docs/cloud/git/import-a-project-by-git-url" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + +
+ \ No newline at end of file diff --git a/website/docs/docs/cloud/git/import-a-project-by-git-url.md b/website/docs/docs/cloud/git/import-a-project-by-git-url.md new file mode 100644 index 00000000000..83846bb1f0b --- /dev/null +++ b/website/docs/docs/cloud/git/import-a-project-by-git-url.md @@ -0,0 +1,133 @@ +--- +title: "Import a project by git URL" +id: "import-a-project-by-git-url" +pagination_next: "docs/cloud/git/connect-github" +pagination_prev: null +--- + +In dbt Cloud, you can import a git repository from any valid git URL that points to a dbt project. There are some important considerations to keep in mind when doing this. + +## Git protocols +You must use the `git@...` or `ssh://...` version of your git URL, not the `https://...` version. dbt Cloud uses the SSH protocol to clone repositories, so dbt Cloud will be unable to clone repos supplied with the HTTP protocol. + + +## Managing Deploy Keys + +After importing a project by Git URL, dbt Cloud will generate a Deploy Key for your repository. To find the deploy key in dbt Cloud: + +1. Click the gear icon in the upper right-hand corner. +2. Click **Account Settings** --> **Projects** and select a project. +3. Click the **Repository** link to the repository details page. +4. Copy the key under the **Deploy Key** section. + +You must provide this Deploy Key in the Repository configuration of your Git host. Configure this Deploy Key to allow *read and write access* to the specified repositories. + +**Note**: Each dbt Cloud project will generate a different deploy key when connected to a repo, even if two projects are connected to the same repo. You will need to supply both deploy keys to your Git provider. + +## GitHub + +:::info Use GitHub? + +If you use GitHub, you can import your repo directly using [dbt Cloud's GitHub Application](/docs/cloud/git/connect-github). Connecting your repo via the GitHub Application [enables Continuous Integration](/docs/deploy/continuous-integration). 
+ +::: + +- To add a deploy key to a GitHub account, navigate to the Deploy keys tab of the settings page in your GitHub repository. +- After supplying a name for the deploy key and pasting in your deploy key (generated by dbt Cloud), be sure to check the **Allow write access** checkbox. +- After adding this key, dbt Cloud will be able to read and write files in your dbt project. +- Refer to [Adding a deploy key in GitHub](https://github.blog/2015-06-16-read-only-deploy-keys/). + + + +## GitLab + +:::info Use GitLab? + +If you use GitLab, you can import your repo directly using [dbt Cloud's GitLab Application](/docs/cloud/git/connect-gitlab). Connecting your repo via the GitLab Application [enables Continuous Integration](/docs/deploy/continuous-integration). + +::: + +- To add a deploy key to a GitLab account, navigate to the [SSH keys](https://gitlab.com/profile/keys) tab in the User Settings page of your GitLab account. +- Next, paste in the deploy key generated by dbt Cloud for your repository. +- After saving this SSH key, dbt Cloud will be able to read and write files in your GitLab repository. +- Refer to [Adding a read only deploy key in GitLab](https://docs.gitlab.com/ee/ssh/#per-repository-deploy-keys). + + + +## BitBucket + +- To add a deploy key to a BitBucket account, navigate to the **SSH keys** tab in the Personal Settings page of your BitBucket account. +- Next, click the **Add key** button and paste in the deploy key generated by dbt Cloud for your repository. +- After saving this SSH key, dbt Cloud will be able to read and write files in your BitBucket repository. + + + +## AWS CodeCommit + +dbt Cloud can work with dbt projects hosted on AWS CodeCommit, but there are some extra steps needed compared to GitHub or other git providers. This guide will help you connect your CodeCommit-hosted dbt project to dbt Cloud. 
+ +#### Step 1: Create an AWS User for dbt Cloud +- To give dbt Cloud access to your repository, first you'll need to create an AWS IAM user for dbt Cloud. +- Log into the AWS Console and navigate to the IAM section. +- Click **Add User**, and create a new user by entering a unique and meaningful user name. +- The user will need clone access to your repository. You can do this by adding the **AWSCodeCommitPowerUser** permission during setup. + +#### Step 2: Import your repository by name +- Open the AWS CodeCommit console and choose your repository. +- Copy the SSH URL from that page. +- Next, navigate to the **New Repository** page in dbt Cloud. +- Choose the **Git Clone** tab, and paste in the SSH URL you copied from the console. +- In the newly created Repository details page, you'll see a **Deploy Key** field. Copy the contents of this field as you'll need it for [Step 3](#step-3-grant-dbt-cloud-aws-user-access) + +**Note:** The dbt Cloud-generated public key is the only key that will work in the next step. Any other key that has been generated outside of dbt Cloud will not work. + +#### Step 3: Grant dbt Cloud AWS User access +- Open up the newly created dbt Cloud user in the AWS IAM Console. +- Choose the **Security Credentials** tab and then click **Upload SSH public key**. +- Paste in the contents of the **Public Key** field from the dbt Cloud Repository page. +- Once you've created the key, you'll see an **SSH key ID** for it. +- **[Contact dbt Support](mailto:support@getdbt.com)** and share that field so that dbt Support team can complete the setup process for you. + +#### Step 4: Specify a custom branch in dbt Cloud + +CodeCommit uses `master` as its default branch, and to initialize your project, you'll need to specify the `master` branch as a [custom branch](/faqs/environments/custom-branch-settings#development) in dbt Cloud. + +- Go to **Deploy** -> **Environments** -> **Development**. 
+- Select **Settings** -> **Edit** and under **General Settings**, check the **Default to a custom branch** checkbox. +- Specify the custom branch as `master` and click **Save**. + +🎉 **You're all set!** Once dbt Support handles your request and you've set your custom branch, your project is ready to execute dbt runs on dbt Cloud. + +## Azure DevOps +:::info Use Azure DevOps? + +If you use Azure DevOps and you are on the dbt Cloud Enterprise plan, you can import your repo directly using [dbt Cloud's Azure DevOps Integration](/docs/cloud/git/connect-azure-devops). Connecting your repo via the Azure DevOps Application [enables Continuous Integration](/docs/deploy/continuous-integration). + +::: + +1. To add a deploy key to an Azure DevOps account, navigate to the **SSH public keys** page in the User Settings of your user's Azure DevOps account or a service user's account. + +2. We recommend using a dedicated service user for the integration to ensure that dbt Cloud's connection to Azure DevOps is not interrupted by changes to user permissions. + + + +3. Next, click the **+ New Key** button to create a new SSH key for the repository. + + + +4. Select a descriptive name for the key and then paste in the deploy key generated by dbt Cloud for your repository. + +5. After saving this SSH key, dbt Cloud will be able to read and write files in your Azure DevOps repository. + + + +## Other git providers + +Don't see your git provider here? Please [contact dbt Support](mailto:support@getdbt.com) - we're happy to help you set up dbt Cloud with any supported git provider. + +## Limited integration +Some features of dbt Cloud require a tight integration with your git host, for example, updating GitHub pull requests with dbt Cloud run statuses. Importing your project by a URL prevents you from using these features. Once you give dbt Cloud access to your repository, you can continue to set up your project by adding a connection and creating and running your first dbt Cloud job. 
+ +## FAQs + + diff --git a/website/docs/docs/collaborate/git/setup-azure.md b/website/docs/docs/cloud/git/setup-azure.md similarity index 60% rename from website/docs/docs/collaborate/git/setup-azure.md rename to website/docs/docs/cloud/git/setup-azure.md index a4aa73b6aef..843371be6ea 100644 --- a/website/docs/docs/collaborate/git/setup-azure.md +++ b/website/docs/docs/cloud/git/setup-azure.md @@ -5,23 +5,23 @@ description: "You can set up your Azure DevOps by creating an Azure AD app and a sidebar_label: "Set up Azure DevOps" --- - + ## Overview -To use our native integration with Azure DevOps in dbt Cloud, an account admin needs to set up an Azure Active Directory (Azure AD) app. We recommend setting up a separate [Azure AD application than used for SSO](/docs/collaborate/manage-access/set-up-sso-azure-active-directory). +To use our native integration with Azure DevOps in dbt Cloud, an account admin needs to set up an Azure Active Directory (Azure AD) app. We recommend setting up a separate [Azure AD application than used for SSO](/docs/cloud/manage-access/set-up-sso-azure-active-directory). 1. [Register an Azure AD app](#register-an-azure-ad-app). 2. [Add permissions to your new app](#add-permissions-to-your-new-app). -3. [Add another redirect URI](#add-another-redirect-URI). +3. [Add another redirect URI](#add-another-redirect-uri). 4. [Connect Azure DevOps to your new app](#connect-azure-devops-to-your-new-app). 5. [Add your Azure AD app to dbt Cloud](#add-your-azure-ad-app-to-dbt-cloud). -Once the the Azure AD app is added to dbt Cloud, an account admin must also connect a service user via OAuth, which will be used to power headless actions in dbt Cloud such as deployment runs and CI. +Once the Azure AD app is added to dbt Cloud, an account admin must also connect a service user via OAuth, which will be used to power headless actions in dbt Cloud such as deployment runs and CI. 1. [Connecting a Service User](#connecting-a-service-user). 
-Once the Azure AD app is added to dbt Cloud and the service user is connected, then dbt Cloud developers can personally authenticate in dbt Cloud from Azure DevOps. For more on this, see [Authenticate with Azure DevOps](/docs/collaborate/git/authenticate-azure). +Once the Azure AD app is added to dbt Cloud and the service user is connected, then dbt Cloud developers can personally authenticate in dbt Cloud from Azure DevOps. For more on this, see [Authenticate with Azure DevOps](/docs/cloud/git/authenticate-azure). ## Register an Azure AD app @@ -31,12 +31,12 @@ Once the Azure AD app is added to dbt Cloud and the service user is connected, t 4. Provide a name for your app. We recommend using, "dbt Labs Azure DevOps App". 5. Select **Accounts in any organizational directory (Any Azure AD directory - Multitenant)** as the Supported Account Types. Many customers ask why they need to select Multitenant instead of Single tenant, and they frequently get this step wrong. Microsoft considers Azure DevOps (formerly called Visual Studio) and Azure Active Directory as separate tenants, and in order for this Active Directory application to work properly, you must select Multitenant. -6. Add a redirect URI by selecting **Web** and typing in `https://cloud.getdbt.com/complete/azure_active_directory`. If you have a custom dbt Cloud URL be sure to use the appropriate domain. +6. Add a redirect URI by selecting **Web** and, in the field, entering `https://YOUR_ACCESS_URL/complete/azure_active_directory`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. 7. Click **Register**. -Here's what your app should look before registering it: +Here's what your app should look like before registering it: @@ -59,8 +59,8 @@ You also need to add another redirect URI to your Azure AD application. This red 1. Navigate to your Azure AD application. 2. Select the link next to **Redirect URIs** -3. 
Click **Add URI** and add the URI, making sure to use the appropriate domain if you have a custom dbt Cloud URL: -`https://cloud.getdbt.com/complete/azure_active_directory_service_user` +3. Click **Add URI** and add the URI, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan: +`https://YOUR_ACCESS_URL/complete/azure_active_directory_service_user` 4. Click **Save**. @@ -93,13 +93,18 @@ Once you connect your Azure AD app and Azure DevOps, you need to provide dbt Clo - **Directory(tenant) ID:** Found in the Azure AD App. -Your Azure AD app should now be added to your dbt Cloud Account. People on your team who want to develop in dbt Cloud's IDE can now personally [authorize Azure DevOps from their profiles](/docs/collaborate/git/authenticate-azure). +Your Azure AD app should now be added to your dbt Cloud Account. People on your team who want to develop in the dbt Cloud IDE or dbt Cloud CLI can now personally [authorize Azure DevOps from their profiles](/docs/cloud/git/authenticate-azure). -## Connecting a service user +## Connect a service user Because Azure DevOps forces all authentication to be linked to a user's permissions, we recommend you create a "service user" in Azure DevOps whose permissions will be used to power headless actions in dbt Cloud such as dbt Cloud project repo selection, deployment runs, and CI. A service user is a pseudo user set up in the same way an admin would set up a real user, but it's given permissions specifically scoped for service to service interactions. You should avoid linking authentication to a real Azure DevOps user because if this person leaves your organization, dbt Cloud will lose privileges to the dbt Azure DevOps repositories, causing production runs to fail. 
-### More on Service Users +:::info Service user authentication expiration +dbt Cloud will refresh the authentication for the service user on each run triggered by the scheduler, API, or CI. If your account does not have any active runs for over 90 days, an admin will need to manually refresh the authentication of the service user by disconnecting and reconnecting the service user's profile via the OAuth flow described above in order to resume headless interactions like project set up, deployment runs, and CI. + +::: + +### Service users permissions A service user account must have the following Azure DevOps permissions for all Azure DevOps projects and repos you want accessible in dbt Cloud. Read more about how dbt Cloud uses each permission in the following paragraphs. @@ -112,11 +117,44 @@ A service user account must have the following Azure DevOps permissions for all \* Note: **DeleteSubscriptions** permission might be included in **EditSubscriptions** depending on your version of Azure. -Some of these permissions are only accessible via the Azure DevOps API, for which documentation can be found [here](https://docs.microsoft.com/en-us/azure/devops/organizations/security/namespace-reference?view=azure-devops). We’ve also detailed more information on Azure DevOps API usage below to help accelerate the set up. Alternatively, you can use the Azure DevOps UI to enable permissions, but you cannot get the least permissioned set. +Some of these permissions are only accessible via the [Azure DevOps API](https://docs.microsoft.com/en-us/azure/devops/organizations/security/namespace-reference?view=azure-devops) or [CLI](https://learn.microsoft.com/en-us/cli/azure/devops?view=azure-cli-latest). We’ve also detailed more information on Azure DevOps API usage below to help accelerate the setup. Alternatively, you can use the Azure DevOps UI to enable permissions, but you cannot get the least permissioned set. 
-:::info Provide the service user with required permissions before setting up a dbt Cloud project -This service user's permissions will also power which repositories a team can select from during dbt project set up, so an Azure DevOps admin must grant at minimum Project Reader access to the service user before setting up a project in dbt Cloud. -::: + + + + + +The service user's permissions will also power which repositories a team can select from during dbt project set up, so an Azure DevOps admin must grant at minimum Project Reader access to the service user _before_ creating a new project in dbt Cloud. If you are migrating an existing dbt project to use the native Azure DevOps integration, the dbt Cloud account's service user must have proper permissions on the repository before migration. + + + + +While it's common to enforce multi-factor authentication (MFA) for normal user accounts, service user authentication must not need an extra factor. If you enable a second factor for the service user, this can interrupt production runs and cause a failure to clone the repository. In order for the OAuth access token to work, the best practice is to remove any additional burden of proof of identity for service users. + +As a result, MFA must be explicitly disabled in the Office 365 or Azure AD administration panel for the service user. Just having it "un-connected" will not be sufficient, as dbt Cloud will be prompted to set up MFA instead of allowing the credentials to be used as intended. + +**To disable MFA for a single user using the Office 365 Administration console:** + +- Go to Microsoft 365 admin center -> Users -> Active users -> Select the user -> Manage multifactor authentication -> Select the user -> Disable multi-factor authentication. + +**To use the Azure AD interface:** + +Note, this procedure involves disabling Security Defaults on AAD. + +1. Go to the AAD Admin Center. 
Scroll down to Azure Active Directory ->Manage ->Properties -> Manage Security defaults and then select **No** in "Enable Security Defaults" +2. Select **Save** +3. Go to **Azure Active Directory** -> Manage -> Users ->Click on the ellipsis (...) and then the Multi-Factor Authentication link. If the link is grayed out, you need to make sure you disable **Security Defaults** +4. The link will take you to a "multi-factor authentication" page. +5. If MFA is enabled for users, select the user(s) and select **Disable** under **Quick steps** +6. Select **Yes** to confirm your changes + +To re-enable MFA for that user, select them again and click **Enable**. Note you may have to go through MFA setup for that user after enabling it. + + + + + +
ViewSubscriptions @@ -141,7 +179,12 @@ This service user's permissions will also power which repositories a team can se - PublisherSecurity for access to all projects - PublisherSecurity/ for per project access -**UI/API:** API only +**UI/API/CLI:** API/CLI only + +**Sample CLI code snippet** +```bash +az devops security permission update --organization https://dev.azure.com/ --namespace-id cb594ebe-87dd-4fc9-ac2c-6a10a4c92046 --subject @xxxxxx.onmicrosoft.com --token PublisherSecurity/ --allow-bit 1 +```
@@ -169,7 +212,12 @@ This service user's permissions will also power which repositories a team can se - PublisherSecurity for access to all projects - PublisherSecurity/ for per project access -**UI/API:** API only +**UI/API/CLI:** API/CLI only + +**Sample CLI code snippet** +```bash +az devops security permission update --organization https://dev.azure.com/ --namespace-id cb594ebe-87dd-4fc9-ac2c-6a10a4c92046 --subject @xxxxxx.onmicrosoft.com --token PublisherSecurity/ --allow-bit 2 +```
@@ -199,7 +247,12 @@ This service user's permissions will also power which repositories a team can se - PublisherSecurity for access to all projects - PublisherSecurity/ for per project access -**UI/API:** API only +**UI/API/CLI:** API/CLI only + +**Sample CLI code snippet** +```bash +az devops security permission update --organization https://dev.azure.com/ --namespace-id cb594ebe-87dd-4fc9-ac2c-6a10a4c92046 --subject @xxxxxx.onmicrosoft.com --token PublisherSecurity/ --allow-bit 4 +``` **Additional Notes:** This permission has been deprecated in recent Azure DevOps versions. Edit Subscriptions (bit 2) has Delete permissions. @@ -233,7 +286,12 @@ This service user's permissions will also power which repositories a team can se - repoV2// for per repo access -**UI/API:** UI and API +**UI/API/CLI:** UI, API, and CLI + +**Sample CLI code snippet** +```bash +az devops security permission update --organization https://dev.azure.com/ --namespace-id 2e9eb7ed-3c0a-47d4-87c1-0ffdd275fd87 --subject @xxxxxx.onmicrosoft.com --token repoV2// --allow-bit 16384 +``` **Additional Notes:** This permission is automatically inherited if Project Reader/Contributor/Administrator is set in the UI. @@ -268,14 +326,19 @@ This service user's permissions will also power which repositories a team can se - repoV2// for access to a single repo at a time -**UI/API:** UI and API +**UI/API/CLI:** UI, API, and CLI + +**Sample CLI code snippet** +```bash +az devops security permission update --organization https://dev.azure.com/ --namespace-id 2e9eb7ed-3c0a-47d4-87c1-0ffdd275fd87 --subject @xxxxxx.onmicrosoft.com --token repoV2// --allow-bit 4 +``` **Additional Notes:** This permission is automatically inherited if Project Contributor/Administrator is set in the UI. -You must connect your service user before setting up a dbt Cloud project, as the the service user's permissions determine which projects dbt Cloud can import. 
+You must connect your service user before setting up a dbt Cloud project, as the service user's permissions determine which projects dbt Cloud can import. To connect the service user: 1. An admin must first be signed into the service user's Azure DevOps account. @@ -286,7 +349,8 @@ To connect the service user: Once connected, dbt Cloud displays the email address of the service user so you know which user's permissions are enabling headless actions in deployment environments. To change which account is connected, disconnect the profile in dbt Cloud, sign into the alternative Azure DevOps service account, and re-link the account in dbt Cloud. -:::info Service user authentication expiration -dbt Cloud will refresh the authentication for the service user on each run triggered by the scheduler, API, or CI. If your account does not have any active runs for over 90 days, an admin will need to manually refresh the authentication of the service user by disconnecting and reconnecting the service user's profile via the OAuth flow described above in order to resume headless interactions like project set up, deployment runs, and CI. +:::info Personal Access Tokens (PATs) +dbt Cloud generates temporary access tokens called Full-scoped PATs for service users to access APIs related to their dbt Cloud project. These tokens are only valid for a short period of 5 minutes and become invalid after they are used to make an API call. +The Azure DevOps Administrator can limit the creation of full-scoped PATs by enabling a policy that restricts users to a custom-defined set of scopes. By default, this policy is set to **off**, but enabling it will cause project setup to fail with an error. After disabling this policy and successfully setting up your project, if you wish to use finer-scoped permissions, some features such as webhooks for CI may be lost, so we recommend the service user has full-scoped PATs. 
To exclude the dbt Cloud service user from the global PAT policy, add them to the allow list as part of your security policy. ::: diff --git a/website/docs/docs/collaborate/manage-access/about-access.md b/website/docs/docs/cloud/manage-access/about-access.md similarity index 61% rename from website/docs/docs/collaborate/manage-access/about-access.md rename to website/docs/docs/cloud/manage-access/about-access.md index 844f49e6500..d394c79baa3 100644 --- a/website/docs/docs/collaborate/manage-access/about-access.md +++ b/website/docs/docs/cloud/manage-access/about-access.md @@ -1,9 +1,16 @@ --- -title: "About access" -id: "about-access" +title: "About user access in dbt Cloud" +description: "Learn how dbt Cloud administrators can use dbt Cloud's permissioning model to control user-level access in a dbt Cloud account." +id: "about-user-access" +pagination_next: "docs/cloud/manage-access/seats-and-users" +pagination_prev: null --- -## Overview +:::info "User access" is not "Model access" + +**User groups and access** and **model groups and access** mean two different things. "Model groups and access" is a specific term used in the language of dbt-core. Refer to [Model access](/docs/collaborate/govern/model-access) for more info on what it means in dbt-core. + +::: dbt Cloud administrators can use dbt Cloud's permissioning model to control user-level access in a dbt Cloud account. This access control comes in two flavors: @@ -17,6 +24,7 @@ License-based and Role-based. a member of multiple groups, and those groups may have permissions on multiple projects. + ## License-based access control Each user on an account is assigned a license type when the user is first @@ -24,14 +32,13 @@ invited to a given account. This license type may change over time, but a user can only have one type of license at any given time. A user's license type controls the features in dbt Cloud that the user is able -to access. 
dbt Cloud's two license types are: - - **Read Only** - - **Developer** +to access. dbt Cloud's three license types are: + + - **Developer** — User may be granted _any_ permissions. + - **Read-Only** — User has read-only permissions applied to all dbt Cloud resources regardless of the role-based permissions that the user is assigned. + - **IT** — User has [Security Admin](/docs/cloud/manage-access/enterprise-permissions#security-admin) and [Billing Admin](/docs/cloud/manage-access/enterprise-permissions#billing-admin) permissions applied regardless of the role-based permissions that the user is assigned. -For more information on these license types, see [Seats & Users](cloud-seats-and-users). -At a high level, Developers may be granted _any_ permissions, whereas Read Only -users will have read-only permissions applied to all dbt Cloud resources -regardless of the role-based permissions that the user is assigned. +For more information on these license types, see [Seats & Users](/docs/cloud/manage-access/seats-and-users). ## Role-based access control @@ -71,7 +78,7 @@ page in your Account Settings. /> -### SSO Mappings +### SSO mappings SSO Mappings connect Identity Provider (IdP) group membership to dbt Cloud group membership. When a user logs into dbt Cloud via a supported identity provider, @@ -89,7 +96,7 @@ groups. ::: -### Permission Sets +### Permission sets Permission sets are predefined collections of granular permissions. Permission sets combine low-level permission grants into high-level roles that can be @@ -100,7 +107,7 @@ assigned to groups. Some examples of existing permission sets are: - Job Viewer - ...and more -For a full list of enterprise permission sets, see [Enterprise Permissions](/docs/collaborate/manage-access/enterprise-permissions). +For a full list of enterprise permission sets, see [Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions). 
These permission sets are available for assignment to groups and control the ability for users in these groups to take specific actions in the dbt Cloud application. @@ -116,12 +123,6 @@ set on the _Internal Analytics_ project. ### Manual assignment - - -- New in version 1.1.23 (March, 2021) - - - dbt Cloud administrators can manually assign users to groups independently of IdP attributes. If a dbt Cloud group is configured _without_ any SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust @@ -136,15 +137,20 @@ sign-in time based on the user's IdP-provided group membership information. ## FAQs -- **When are IdP group memberships updated for SSO Mapped groups?** Group memberships - are updated every time a user logs into dbt Cloud via a supported SSO provider. If - you've changed group memberships in your identity provider or dbt Cloud, ask your - users to log back into dbt Cloud for these group memberships to be synchronized. - -- **Can I set up SSO without RBAC?** Yes, see the documentation on - [Manual Assignment](#manual-assignment) above for more information on using - SSO without RBAC. - -- **Can I configure a user's License Type based on IdP Attributes?** Yes, see - the docs on [managing license types](/cloud-seats-and-users#managing-license-types) - for more information. + +- **When are IdP group memberships updated for SSO Mapped groups?**
+ Group memberships are updated whenever a user logs into dbt Cloud via a supported SSO provider. If you've changed group memberships in your identity provider or dbt Cloud, ask your users to log back into dbt Cloud to synchronize these group memberships. +- **Can I set up SSO without RBAC?**
+Yes, see the documentation on [Manual Assignment](#manual-assignment) above for more information on using SSO without RBAC. +- **Can I configure a user's License Type based on IdP Attributes?**
+ Yes, see the docs on [managing license types](/docs/cloud/manage-access/seats-and-users#managing-license-types) for more information. + +- **Why can't I edit a user's group membership?**
+Make sure you're not trying to edit your own user as this isn't allowed for security reasons. To edit the group membership of your own user, you'll need a different user to make those changes. + +- **How do I add or remove users?**
+Each dbt Cloud plan comes with a base number of Developer and Read-Only licenses. You can add or remove licenses by modifying the number of users in your account settings. + - If you're on an Enterprise plan and have the correct [permissions](/docs/cloud/manage-access/enterprise-permissions), you can add or remove developers by adjusting your developer user seat count in **Account settings** -> **Users**. + - If you're on a Team plan and have the correct [permissions](/docs/cloud/manage-access/self-service-permissions), you can add or remove developers by making two changes: adjust your developer user seat count AND your developer billing seat count in **Account settings** -> **Users** and then in **Account settings** -> **Billing**. + + Refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users#licenses) for detailed steps. diff --git a/website/docs/docs/collaborate/manage-access/audit-log.md b/website/docs/docs/cloud/manage-access/audit-log.md similarity index 92% rename from website/docs/docs/collaborate/manage-access/audit-log.md rename to website/docs/docs/cloud/manage-access/audit-log.md index 78d59d9a0a2..b90bceef570 100644 --- a/website/docs/docs/collaborate/manage-access/audit-log.md +++ b/website/docs/docs/cloud/manage-access/audit-log.md @@ -3,26 +3,24 @@ title: "The audit log for dbt Cloud Enterprise" id: audit-log description: "You can troubleshoot possible issues and provide security audits by reviewing event activity in your organization." sidebar_label: "Audit log" +pagination_next: null +pagination_prev: "docs/cloud/manage-access/about-user-access" --- -To review actions performed by people in your organization, dbt provides logs of audited user and system events. You can use the audit log to quickly review the actions performed by members of your organization. The audit log includes details such as who performed the action, what the action was, and when it was performed.
You can use these details to troubleshoot access issues, perform security audits, or analyze specific events. +To review actions performed by people in your organization, dbt provides logs of audited user and system events in real time. The audit log appears as events happen and includes details such as who performed the action, what the action was, and when it was performed. You can use these details to troubleshoot access issues, perform security audits, or analyze specific events. You must be an **Account Admin** to access the audit log and this feature is only available on Enterprise plans. -The dbt Cloud audit log stores all the events that occurred in your organization: +The dbt Cloud audit log stores all the events that occurred in your organization in real-time, including: - For events within 90 days, the dbt Cloud audit log has a selectable date range that lists events triggered. - For events beyond 90 days, **Account Admins** can [export all events](#exporting-logs) by using **Export All**. ## Accessing the audit log -To access audit log, click the gear icon in the top right, then click **Audit Log**. +To access the audit log, click the gear icon in the top right, then click **Audit Log**. -
- - - -
+ ## Understanding the audit log @@ -161,19 +159,17 @@ The audit log supports various events for different objects in dbt Cloud. You wi You can search the audit log to find a specific event or actor, which is limited to the ones listed in [Events in audit log](#events-in-audit-log). The audit log successfully lists historical events spanning the last 90 days. You can search for an actor or event using the search bar, and then narrow your results using the time window. -
- + -
## Exporting logs You can use the audit log to export all historical audit results for security, compliance, and analysis purposes: -- For events within 90 days — dbt Cloud will automatically display the 90 days selectable date range. Select **Export Selection** to download a CSV file of all the events that occurred in your organization within 90 days. +- For events within 90 days — dbt Cloud will automatically display the 90-day selectable date range. Select **Export Selection** to download a CSV file of all the events that occurred in your organization within 90 days. - For events beyond 90 days — Select **Export All**. The Account Admin will receive an email link to download a CSV file of all the events that occurred in your organization. - + diff --git a/website/docs/docs/cloud/manage-access/auth0-migration.md b/website/docs/docs/cloud/manage-access/auth0-migration.md new file mode 100644 index 00000000000..0d7b715b6c6 --- /dev/null +++ b/website/docs/docs/cloud/manage-access/auth0-migration.md @@ -0,0 +1,124 @@ +--- +title: "Migrating to Auth0 for SSO" +id: "auth0-migration" +sidebar: "SSO Auth0 Migration" +description: "Required actions for migrating to Auth0 for SSO services on dbt Cloud." +--- +:::warning Limited availability + +This is a new feature that is being implemented incrementally to customers using single sign-on features today. If you have any questions or concerns about the availability of the migration feature, please [contact support](mailto:support@getdbt.com). + +::: + +dbt Labs is partnering with Auth0 to bring enhanced features to dbt Cloud's single sign-on (SSO) capabilities. Auth0 is an identity and access management (IAM) platform with advanced security features, and it will be leveraged by dbt Cloud. These changes will require some action from customers with SSO configured in dbt Cloud today, and this guide will outline the necessary changes for each environment. 
+ +If you have not yet configured SSO in dbt Cloud, refer instead to our setup guides for [SAML](/docs/cloud/manage-access/set-up-sso-saml-2.0), [Okta](/docs/cloud/manage-access/set-up-sso-okta), [Google Workspace](/docs/cloud/manage-access/set-up-sso-google-workspace), or [Azure Active Directory](/docs/cloud/manage-access/set-up-sso-azure-active-directory) single sign-on services. + +## Auth0 Multi-tenant URIs + + + +## Start the migration + +The Auth0 migration feature is being rolled out incrementally to customers who have SSO features already enabled. When the migration option has been enabled on your account, you will see **SSO Updates Available** on the right side of the menu bar, near the settings icon. + + + +Alternatively, you can start the process from the **Settings** page in the **Single Sign-on** pane. Click the **Begin Migration** button to start. + + + +Once you have opted to begin the migration process, the following steps will vary depending on the configured identity provider. You can just skip to the section that's right for your environment. These steps only apply to customers going through the migration; new setups will use the existing [setup instructions](/docs/cloud/manage-access/sso-overview). + +:::warning Login {slug} + +Slugs should contain only letters, numbers, and dashes. Make sure to remove underscores (if they exist) from login slugs: +* before migrating on the **Account Settings** page, or +* while migrating (before enabling), as shown in the Migrate authentication screenshots for your respective setup. +After changing the slug, admins must share the new login URL with their dbt Cloud users. + +::: + +## SAML 2.0 and Okta + +SAML 2.0 users must update a few fields in the SSO app configuration to match the new Auth0 URL and URI. You can approach this by editing the existing SSO app settings or creating a new one to accommodate the Auth0 settings. 
One approach isn't inherently better, so you can choose whichever works best for your organization. + +The fields that will be updated are: +- Single sign-on URL — `https:///login/callback?connection={slug}` +- Audience URI (SP Entity ID) — `urn:auth0::{slug}` + +Below are sample steps to update. You must complete all of them to ensure uninterrupted access to dbt Cloud and you should coordinate with your identity provider admin when making these changes. + +1. Replace `{slug}` with your organization’s login slug. It must be unique across all dbt Cloud instances and is usually something like your company name separated by dashes (for example, `dbt-labs`). + +Here is an example of an updated SAML 2.0 setup in Okta. + + + +2. Save the configuration, and your SAML settings will look something like this: + + + +3. Toggle the `Enable new SSO authentication` option to ensure the traffic is routed correctly. _The new SSO migration action is final and cannot be undone_ + + + +4. Save the settings and test the new configuration using the SSO login URL provided on the settings page. + +## Google Workspace + +Google Workspace admins updating their SSO APIs with the Auth0 URL won't have to do much if it is an existing setup. This can be done as a new project or by editing an existing SSO setup. No additional scopes are needed since this is migrating from an existing setup. All scopes were defined during the initial configuration. + +Below are steps to update. You must complete all of them to ensure uninterrupted access to dbt Cloud and you should coordinate with your identity provider admin when making these changes. + +1. Open the [Google Cloud console](https://console.cloud.google.com/) and select the project with your dbt Cloud single sign-on settings. From the project page **Quick Access**, select **APIs and Services** + + + +2. Click **Credentials** from the left side pane and click the appropriate name from **OAuth 2.0 Client IDs** + + + +3. 
In the **Client ID for Web application** window, find the **Authorized Redirect URIs** field and click **Add URI** and enter `https:///login/callback`. + +Click **Save** once you are done. + + + +4. _You will need a person with Google Workspace admin privileges to complete these steps in dbt Cloud_. In dbt Cloud, navigate to the **Account Settings**, click on **Single Sign-on**, and then click **Edit** on the right side of the SSO pane. Toggle the **Enable New SSO Authentication** option and select **Save**. This will trigger an authorization window from Google that will require admin credentials. _The migration action is final and cannot be undone_. Once the authentication has gone through, test the new configuration using the SSO login URL provided on the settings page. + +:::warning Domain authorization + +You must complete the domain authorization before you toggle `Enable New SSO Authentication`, or the migration will not complete successfully. + +::: + + + +## Azure Active Directory + +Azure Active Directory admins will need to make a slight adjustment to the existing authentication app in the Azure AD portal. This migration does not require that the entire app be deleted or recreated; you can edit the existing app. Start by opening the Azure portal and navigating to the Active Directory overview. + +Below are steps to update. You must complete all of them to ensure uninterrupted access to dbt Cloud and you should coordinate with your identity provider admin when making these changes. + +1. Click **App Registrations** on the left side menu. + + + +2. Select the proper **dbt Cloud** app (name may vary) from the list. From the app overview, click on the hyperlink next to **Redirect URI** + + + +3. In the **Web** pane with **Redirect URIs**, click **Add URI** and enter the appropriate `https:///login/callback`. Save the settings and verify it is counted in the updated app overview. + + + +4. 
Navigate to the dbt Cloud environment and open the **Account Settings**. Click the **Single Sign-on** option from the left side menu and click the **Edit** option from the right side of the SSO pane. The **domain** field is the domain your organization uses to log in to Azure AD. Toggle the **Enable New SSO Authentication** option and **Save**. _Once this option is enabled, it cannot be undone._ + +:::warning Domain authorization + +You must complete the domain authorization before you toggle `Enable New SSO Authentication`, or the migration will not complete successfully. + +::: + + diff --git a/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md new file mode 100644 index 00000000000..24c64a5abed --- /dev/null +++ b/website/docs/docs/cloud/manage-access/cloud-seats-and-users.md @@ -0,0 +1,179 @@ +--- +title: "Users and licenses" +description: "Learn how dbt Cloud administrators can use licenses and seats to control access in a dbt Cloud account." +id: "seats-and-users" +sidebar: "Users and licenses" +pagination_next: "docs/cloud/manage-access/self-service-permissions" +pagination_prev: null +--- + +In dbt Cloud, _licenses_ are used to allocate users to your account. There are three different types of licenses in dbt Cloud: + +- **Developer** — Granted access to the Deployment and [Development](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) functionality in dbt Cloud. +- **Read-Only** — Intended to view the [artifacts](/docs/deploy/artifacts) created in a dbt Cloud account. Read-Only users can receive job notifications but not configure them. +- **IT** — Can manage users, groups, and licenses, among other permissions. IT users can receive job notifications but not configure them. Available on Enterprise and Team plans only. + +The user's assigned license determines the specific capabilities they can access in dbt Cloud.
+ +| Functionality | Developer User | Read-Only Users | IT Users* | +| ------------- | -------------- | --------------- | -------- | +| Use the dbt Cloud IDE | ✅ | ❌ | ❌ | +| Use the dbt Cloud CLI | ✅ | ❌ | ❌ | +| Use Jobs | ✅ | ❌ | ❌ | +| Manage Account | ✅ | ❌ | ✅ | +| API Access | ✅ | ❌ | ❌ | +| Use [Source Freshness](/docs/deploy/source-freshness) | ✅ | ✅ | ❌ | +| Use [Docs](/docs/collaborate/build-and-view-your-docs) | ✅ | ✅ | ❌ | +| Receive [Job notifications](/docs/deploy/job-notifications) | ✅ | ✅ | ✅ | +*Available on Enterprise and Team plans only and doesn't count toward seat usage. Please note that IT seats are limited to 1 seat per Team or Enterprise account. + +## Licenses + +Each dbt Cloud plan comes with a base number of Developer, IT, and Read-Only licenses. You can add or remove licenses by modifying the number of users in your account settings. + +If you have a Developer plan account and want to add more people to your team, you'll need to upgrade to the Team plan. Refer to [dbt Pricing Plans](https://www.getdbt.com/pricing/) for more information about licenses available with each plan. + +The following tabs detail steps on how to modify your user license count: + + + + + +If you're on an Enterprise plan and have the correct [permissions](/docs/cloud/manage-access/enterprise-permissions), you can add or remove licenses by adjusting your user seat count. Note, an IT license does not count toward seat usage. + +- To remove a user, go to **Account Settings**, select **Users** under **Teams**. Select the user you want to remove, click **Edit**, and then **Delete**. This action cannot be undone. However, you can re-invite the user with the same info if you deleted the user in error.
+ +- To add a user, go to **Account Settings**, select **Users** under **Teams**. Select **Invite Users**. For fine-grained permission configuration, refer to [Role based access control](/docs/cloud/manage-access/enterprise-permissions). + + + +
+ + + +If you're on a Team plan and have the correct [permissions](/docs/cloud/manage-access/self-service-permissions), you can add or remove developers. You'll need to make two changes: + +- Adjust your developer user seat count, which manages the users invited to your dbt Cloud project. AND +- Adjust your developer billing seat count, which manages the number of billable seats. + + +You can add or remove developers by increasing or decreasing the number of users and billable seats in your account settings: + + + + +To add a user in dbt Cloud, you must be an account owner or have admin privileges. + +1. From dbt Cloud, click the gear icon at the top right and select **Account Settings**. + + + +2. In **Account Settings**, select **Billing**. +3. Enter the number of developer seats you want and make sure you fill in all the payment details, including the **Billing Address** section. Leaving these blank won't allow you to save your changes. +4. Press **Update Payment Information** to save your changes. + + + + + +Now that you've updated your billing, you can invite users to join your dbt Cloud account: + +4. In **Account Settings**, select **Users** under **Teams**. +5. Select the user you want to add by clicking **Invite Users**. +6. In the **Invite Users** side panel, add the invited user's email(s), assign their license, and Groups. +7. Click **Send Invitations** at the bottom of the page. + + + +Great work! After completing these steps, your dbt Cloud user count and billing count should now be the same. + + + + +To delete a user in dbt Cloud, you must be an account owner or have admin privileges. If the user has a `developer` license type, this will open up their seat for another user or allow the admins to lower the total number of seats. + +1. From dbt Cloud, click the gear icon at the top right and select **Account Settings**. + + + +2. In **Account Settings**, select **Users** under **Teams**. +3.
Select the user you want to delete, then click **Edit**. +4. Click **Delete** in the bottom left. Click **Confirm Delete** to immediately delete the user without additional password prompts. This action cannot be undone. However, you can re-invite the user with the same information if the deletion was made in error. + + + + +If you are on a **Teams** plan and you're deleting users to reduce the number of billable seats, follow these steps to lower the license count to avoid being overcharged: + +1. In **Account Settings**, select **Billing**. +2. Enter the number of developer seats you want and make sure you fill in all the payment details, including the **Billing Address** section. If you leave any field blank, you won't be able to save your changes. +3. Click **Update Payment Information** to save your changes. + + + +Great work! After completing these steps, your dbt Cloud user count and billing count should now be the same. + + + + + +
+ +## Managing license types + +Licenses can be assigned manually, or automatically based on IdP configuration +(enterprise only). By default, new users in an account will be assigned a +Developer license. + +### Manual configuration + +To manually assign a specific type of license to a user on your team, navigate +to the Team page in your Account Settings and click the "edit" button for the user +you want to manage. From this page, you can select the license type and relevant +groups for the user. + +**Note:** You will need to have an available license ready +to allocate for the user. If your account does not have an available license to +allocate, you will need to add more licenses to your plan to complete the license +change. + + + +### Mapped configuration + +**Note:** This feature is only available on the Enterprise plan. + +If your account is connected to an Identity Provider (IdP) for [Single Sign +On](/docs/cloud/manage-access/sso-overview), you can automatically map IdP user +groups to specific license types in dbt Cloud. To configure license mappings, +navigate to the Account Settings > Team > License Mappings page. From +here, you can create or edit SSO mappings for both Read-Only and Developer +license types. + +By default, all new members of a dbt Cloud account will be assigned a Developer +license. To assign Read-Only licenses to certain groups of users, create a new +License Mapping for the Read-Only license type and include a comma separated +list of IdP group names that should receive a Read-Only license at sign-in time. + + + +Usage notes: +- If a user's IdP groups match both a Developer and Read-Only license type + mapping, a Developer license type will be assigned +- If a user's IdP groups do not match _any_ license type mappings, a Developer + license will be assigned +- License types are adjusted when users sign into dbt Cloud via Single Sign On. 
+ Changes made to license type mappings will take effect the next time users + sign in to dbt Cloud. +- License type mappings are based on _IdP Groups_, not _dbt Cloud groups_, so be + sure to check group memberships in your identity provider when configuring + this feature. + + +## Granular permissioning + +The dbt Cloud Enterprise plan supports Role-Based access controls for +configuring granular in-app permissions. See [access control](/docs/cloud/manage-access/about-user-access) +for more information on Enterprise permissioning. diff --git a/website/docs/docs/cloud/manage-access/enterprise-permissions.md b/website/docs/docs/cloud/manage-access/enterprise-permissions.md new file mode 100644 index 00000000000..dcacda20deb --- /dev/null +++ b/website/docs/docs/cloud/manage-access/enterprise-permissions.md @@ -0,0 +1,38 @@ +--- +title: "Enterprise permissions" +id: "enterprise-permissions" +description: "Permission sets for Enterprise plans." +hide_table_of_contents: true #For the sake of the tables on this page +pagination_next: null +--- + +import Permissions from '/snippets/_enterprise-permissions-table.md'; +import SetUpPages from '/snippets/_available-enterprise-only.md'; + + + +The dbt Cloud Enterprise plan supports a number of pre-built permission sets to +help manage access controls within a dbt Cloud account. See the docs on [access +control](/docs/cloud/manage-access/about-user-access) for more information on Role-Based access +control (RBAC). + +## Roles and permissions + +The following roles and permission sets are available for assignment in dbt Cloud Enterprise accounts. They can be granted to dbt Cloud groups which are then in turn granted to users. A dbt Cloud group can be associated with more than one role and permission set. Roles with more access take precedence. 
+ + + +## How to set up RBAC Groups in dbt Cloud + +Role-Based Access Control (RBAC) is helpful for automatically assigning permissions to dbt admins based on their SSO provider group associations. + +1. Click the gear icon to the top right and select **Account Settings**. From the **Team** section, click **Groups** + + + +1. Select an existing group or create a new group to add RBAC. Name the group (this can be any name you like, but it's recommended to keep it consistent with the SSO groups). If you have configured SSO with SAML 2.0, you may have to use the GroupID instead of the name of the group. +2. Configure the SSO provider groups you want to add RBAC by clicking **Add** in the **SSO** section. These fields are case-sensitive and must match the source group formatting. +3. Configure the permissions for users within those groups by clicking **Add** in the **Access** section of the window. + + +4. When you've completed your configurations, click **Save**. Users will begin to populate the group automatically once they have signed in to dbt Cloud with their SSO credentials. diff --git a/website/docs/docs/collaborate/manage-access/licenses-and-groups.md b/website/docs/docs/cloud/manage-access/licenses-and-groups.md similarity index 92% rename from website/docs/docs/collaborate/manage-access/licenses-and-groups.md rename to website/docs/docs/cloud/manage-access/licenses-and-groups.md index da59601f508..83b926c7445 100644 --- a/website/docs/docs/collaborate/manage-access/licenses-and-groups.md +++ b/website/docs/docs/cloud/manage-access/licenses-and-groups.md @@ -24,14 +24,15 @@ invited to a given account. This license type may change over time, but a user can only have one type of license at any given time. A user's license type controls the features in dbt Cloud that the user is able -to access. dbt Cloud's two license types are: - - **Read Only** +to access. 
dbt Cloud's three license types are: + - **Read-Only** - **Developer** + - **IT** -For more information on these license types, see [Seats & Users](cloud-seats-and-users). -At a high-level, Developers may be granted _any_ permissions, whereas Read Only +For more information on these license types, see [Seats & Users](/docs/cloud/manage-access/seats-and-users). +At a high-level, Developers may be granted _any_ permissions, whereas Read-Only users will have read-only permissions applied to all dbt Cloud resources -regardless of the role-based permissions that the user is assigned. +regardless of the role-based permissions that the user is assigned. IT users will have Security Admin and Billing Admin permissions applied regardless of the role-based permissions that the user is assigned. ## Role-based access control @@ -100,7 +101,7 @@ assigned to groups. Some examples of existing permission sets are: - Job Viewer - ...and more -For a full list of enterprise permission sets, see [Enterprise Permissions](/docs/collaborate/manage-access/enterprise-permissions). +For a full list of enterprise permission sets, see [Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions). These permission sets are available for assignment to groups and control the ability for users in these groups to take specific actions in the dbt Cloud application. @@ -116,12 +117,6 @@ set on the _Internal Analytics_ project. ### Manual assignment - - -- New in version 1.1.23 (March, 2021) - - - dbt Cloud administrators can manually assign users to groups independently of IdP attributes. If a dbt Cloud group is configured _without_ any SSO Mappings, then the group will be _unmanaged_ and dbt Cloud will not adjust @@ -146,5 +141,5 @@ sign-in time based on the user's IdP-provided group membership information. SSO without RBAC. 
- **Can I configure a user's License Type based on IdP Attributes?** Yes, see - the docs on [managing license types](/cloud-seats-and-users#managing-license-types) + the docs on [managing license types](/docs/cloud/manage-access/seats-and-users#managing-license-types) for more information. diff --git a/website/docs/docs/collaborate/manage-access/self-service-permissions.md b/website/docs/docs/cloud/manage-access/self-service-permissions.md similarity index 81% rename from website/docs/docs/collaborate/manage-access/self-service-permissions.md rename to website/docs/docs/cloud/manage-access/self-service-permissions.md index 0d6375452aa..d3c9cf8f5ea 100644 --- a/website/docs/docs/collaborate/manage-access/self-service-permissions.md +++ b/website/docs/docs/cloud/manage-access/self-service-permissions.md @@ -1,8 +1,8 @@ --- title: "Self-service permissions" +description: "Learn how dbt Cloud administrators can use self-service permissions to control access in a dbt Cloud account." id: "self-service-permissions" --- -## Overview dbt Cloud supports two different permission sets to manage permissions for self-service accounts: **Member** and **Owner**. @@ -12,15 +12,16 @@ The permissions afforded to each role are described below: | ------ | ------ | ----- | | View and edit resources | ✅ | ✅ | | Trigger runs | ✅ | ✅ | -| Access the IDE | ✅ | ✅ | +| Access the dbt Cloud IDE | ✅ | ✅ | +| Access the dbt Cloud CLI | ✅ | ✅ | | Invite Members to the account | ✅ | ✅ | | Manage billing | ❌ | ✅ | | Manage team permissions | ❌ | ✅ | | Invite Owners to the account | ❌ | ✅ | -## Read Only vs. Developer License Types +## Read-Only vs. Developer License Types -Users configured with Read Only license types will experience a restricted set of permissions in dbt Cloud. If a user is associated with a _Member_ permission set and a Read Only seat license, then they will only have access to what a Read-Only seat allows. 
See [Seats and Users](cloud-seats-and-users) for more information on the impact of licenses on these permissions. +Users configured with Read-Only license types will experience a restricted set of permissions in dbt Cloud. If a user is associated with a _Member_ permission set and a Read-Only seat license, then they will only have access to what a Read-Only seat allows. See [Seats and Users](/docs/cloud/manage-access/seats-and-users) for more information on the impact of licenses on these permissions. ## Owner and Member Groups in dbt Cloud Enterprise @@ -30,6 +31,7 @@ You will need owner and member groups to help with account onboarding, but these After onboarding administrative users and configuring RBAC/SSO groups, we recommend the following steps for onboarding users to a dbt Cloud Enterprise account. + ### Prerequisites You need to create an Account Admins group before removing any other groups. diff --git a/website/docs/docs/collaborate/manage-access/set-up-bigquery-oauth.md b/website/docs/docs/cloud/manage-access/set-up-bigquery-oauth.md similarity index 86% rename from website/docs/docs/collaborate/manage-access/set-up-bigquery-oauth.md rename to website/docs/docs/cloud/manage-access/set-up-bigquery-oauth.md index e807431245a..1b8f9ea7357 100644 --- a/website/docs/docs/collaborate/manage-access/set-up-bigquery-oauth.md +++ b/website/docs/docs/cloud/manage-access/set-up-bigquery-oauth.md @@ -1,6 +1,8 @@ --- title: "Set up BigQuery OAuth" +description: "Learn how dbt Cloud administrators can use BigQuery OAuth to control access in a dbt Cloud account" id: "set-up-bigquery-oauth" +pagination_next: null --- :::info Enterprise Feature @@ -8,7 +10,6 @@ id: "set-up-bigquery-oauth" This guide describes a feature of the dbt Cloud Enterprise plan. If you’re interested in learning more about an Enterprise plan, contact us at sales@getdbt.com. 
::: -### Overview dbt Cloud supports developer [OAuth](https://cloud.google.com/bigquery/docs/authentication) with BigQuery, providing an additional layer of security for dbt enterprise users. When BigQuery OAuth is enabled for a dbt Cloud project, all dbt Cloud developers must authenticate with BigQuery in order to use the dbt Cloud IDE. The project's deployment environments will still leverage the BigQuery service account key set in the project credentials. @@ -33,17 +34,15 @@ On the **Credentials** page, you can see your existing keys, client IDs, and ser Set up an [OAuth consent screen](https://support.google.com/cloud/answer/6158849) if you haven't already. Then, click **+ Create Credentials** at the top of the page and select **OAuth client ID**. -Fill in the application details as follows: +Fill in the application, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan: | Config | Value | | ------ | ----- | | **Application type** | Web application | | **Name** | dbt Cloud | -| **Authorized Javascript origins** | https://cloud.getdbt.com | -| **Authorized Redirect URIs** | https://cloud.getdbt.com/complete/bigquery | +| **Authorized Javascript origins** | https://YOUR_ACCESS_URL | +| **Authorized Redirect URIs** | https://YOUR_ACCESS_URL/complete/bigquery | -If you're a dbt Cloud single tenant customer, you need to replace `cloud.getdbt.com` with the hostname of -your dbt Cloud instance. Then click **Create** to create the BigQuery OAuth app and see the app client ID and secret values. These values are available even if you close the app screen, so this isn't the only chance you have to save them. @@ -53,7 +52,7 @@ Then click **Create** to create the BigQuery OAuth app and see the app client ID ### Configure the Connection in dbt Cloud Now that you have an OAuth app set up in BigQuery, you'll need to add the client ID and secret to dbt Cloud. 
To do so: - - go to [Settings](https://cloud.getdbt.com/next/settings/profile) + - go to Settings by clicking the gear in the top right. - on the left, select **Projects** under **Account Settings** - choose your project from the list - select **Connection** to edit the connection details @@ -64,7 +63,9 @@ Now that you have an OAuth app set up in BigQuery, you'll need to add the client ### Authenticating to BigQuery Once the BigQuery OAuth app is set up for a dbt Cloud project, each dbt Cloud user will need to authenticate with BigQuery in order to use the IDE. To do so: -- go to the [Credentials](https://cloud.getdbt.com/next/settings/profile#credentials) section + +- Click the gear icon at the top right and select **Profile settings**. +- Select **Credentials**. - choose your project from the list - select **Authenticate BigQuery Account** diff --git a/website/docs/docs/cloud/manage-access/set-up-databricks-oauth.md b/website/docs/docs/cloud/manage-access/set-up-databricks-oauth.md new file mode 100644 index 00000000000..679133b7844 --- /dev/null +++ b/website/docs/docs/cloud/manage-access/set-up-databricks-oauth.md @@ -0,0 +1,77 @@ +--- +title: "Set up Databricks OAuth" +description: "Learn how dbt Cloud administrators can use Databricks OAuth to control access in a dbt Cloud account." +id: "set-up-databricks-oauth" +--- + +:::info Enterprise Feature + +This guide describes a feature of the dbt Cloud Enterprise plan. If you’re interested in learning more about an Enterprise plan, contact us at sales@getdbt.com. + +::: + +dbt Cloud supports developer OAuth ([OAuth for partner solutions](https://docs.databricks.com/en/integrations/manage-oauth.html)) with Databricks, providing an additional layer of security for dbt enterprise users. When you enable Databricks OAuth for a dbt Cloud project, all dbt Cloud developers must authenticate with Databricks in order to use the dbt Cloud IDE. 
The project's deployment environments will still leverage the Databricks authentication method set at the environment level. + +:::tip Beta Feature + +Databricks OAuth support in dbt Cloud is a [beta feature](/docs/dbt-versions/product-lifecycles#dbt-cloud) and subject to change without notification. More updates to this feature are coming soon. + +Current limitations: +- Databricks OAuth applications are in public preview +- The current experience requires the IDE to be restarted every hour (access tokens expire after 1 hour - [workaround](https://docs.databricks.com/en/integrations/manage-oauth.html#override-the-default-token-lifetime-policy-for-dbt-core-power-bi-or-tableau-desktop)) + +::: + +### Configure Databricks OAuth (Databricks admin) + +To get started, you will need to [add dbt as an OAuth application](https://docs.databricks.com/en/integrations/configure-oauth-dbt.html) with Databricks, in 2 steps: + +1. From your terminal, [authenticate to the Databricks Account API](https://docs.databricks.com/en/integrations/configure-oauth-dbt.html#authenticate-to-the-account-api) with the Databricks CLI. You can authenticate using: + - OAuth for users ([prerequisites](https://docs.databricks.com/en/dev-tools/auth.html#oauth-u2m-auth)) + - OAuth for service principals ([prerequisites](https://docs.databricks.com/en/dev-tools/auth.html#oauth-m2m-auth)) + - Username and password (must be account admin) +2. 
In the same terminal, **add dbt Cloud as an OAuth application** using `curl` and the [OAuth Custom App Integration API](https://docs.databricks.com/api/account/customappintegration/create). + +For the second step, you can use this example `curl` to authenticate with your username and password, replacing values as defined in the following table: + +```shell +curl -u USERNAME:PASSWORD https://accounts.cloud.databricks.com/api/2.0/accounts/ACCOUNT_ID/oauth2/custom-app-integrations -d '{"redirect_urls": ["https://YOUR_ACCESS_URL", "https://YOUR_ACCESS_URL/complete/databricks"], "confidential": true, "name": "NAME", "scopes": ["sql", "offline_access"]}' +``` + +These parameters and descriptions will help you authenticate with your username and password: + +| Parameter | Description | +| ------ | ----- | +| **USERNAME** | Your Databricks username (account admin level) | +| **PASSWORD** | Your Databricks password (account admin level) | +| **ACCOUNT_ID** | Your Databricks [account ID](https://docs.databricks.com/en/administration-guide/account-settings/index.html#locate-your-account-id) | +| **YOUR_ACCESS_URL** | The [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your dbt Cloud account region and plan | +| **NAME** | The integration name (for example, `databricks-dbt-cloud`) | + +After running the `curl`, you'll get an API response that includes the `client_id` and `client_secret` required in the following section. At this time, this is the only way to retrieve the secret. If you lose the secret, then the integration needs to be [deleted](https://docs.databricks.com/api/account/customappintegration/delete) and re-created. + + +### Configure the Connection in dbt Cloud (dbt Cloud project admin) + +Now that you have an OAuth app set up in Databricks, you'll need to add the client ID and secret to dbt Cloud. To do so: + - go to Settings by clicking the gear in the top right. 
+ - on the left, select **Projects** under **Account Settings** + - choose your project from the list + - select **Connection** to edit the connection details + - add the `OAuth Client ID` and `OAuth Client Secret` from the Databricks OAuth app under the **Optional Settings** section + + + +### Authenticating to Databricks (dbt Cloud IDE developer) + +Once the Databricks connection via OAuth is set up for a dbt Cloud project, each dbt Cloud user will need to authenticate with Databricks in order to use the IDE. To do so: + +- Click the gear icon at the top right and select **Profile settings**. +- Select **Credentials**. +- Choose your project from the list +- Select `OAuth` as the authentication method, and click **Save** +- Finalize by clicking the **Connect Databricks Account** button + + + +You will then be redirected to Databricks and asked to approve the connection. This redirects you back to dbt Cloud. You should now be an authenticated Databricks user, ready to use the dbt Cloud IDE. diff --git a/website/docs/docs/collaborate/manage-access/set-up-snowflake-oauth.md b/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md similarity index 74% rename from website/docs/docs/collaborate/manage-access/set-up-snowflake-oauth.md rename to website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md index 270ceadf6c8..5b9abb6058a 100644 --- a/website/docs/docs/collaborate/manage-access/set-up-snowflake-oauth.md +++ b/website/docs/docs/cloud/manage-access/set-up-snowflake-oauth.md @@ -1,5 +1,6 @@ --- title: "Set up Snowflake OAuth" +description: "Learn how dbt Cloud administrators can use Snowflake OAuth to control access in a dbt Cloud account." id: "set-up-snowflake-oauth" --- @@ -16,7 +17,7 @@ To enable Snowflake OAuth, you will need to create a [security integration](http ### Create a security integration -In Snowflake, execute a query to create a security integration. 
Please find the complete documentation on creating a security integration for custom clients [here](https://docs.snowflake.net/manuals/sql-reference/sql/create-security-integration.html#syntax). You can find a sample `create or replace security integration` query below. +In Snowflake, execute a query to create a security integration. Please find the complete documentation on creating a security integration for custom clients [here](https://docs.snowflake.net/manuals/sql-reference/sql/create-security-integration.html#syntax). In the following example `create or replace security integration` query, replace `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. ``` CREATE OR REPLACE SECURITY INTEGRATION DBT_CLOUD @@ -24,7 +25,7 @@ CREATE OR REPLACE SECURITY INTEGRATION DBT_CLOUD ENABLED = TRUE OAUTH_CLIENT = CUSTOM OAUTH_CLIENT_TYPE = 'CONFIDENTIAL' - OAUTH_REDIRECT_URI = 'https://cloud.getdbt.com/complete/snowflake' + OAUTH_REDIRECT_URI = 'https://YOUR_ACCESS_URL/complete/snowflake' OAUTH_ISSUE_REFRESH_TOKENS = TRUE OAUTH_REFRESH_TOKEN_VALIDITY = 7776000; ``` @@ -41,7 +42,7 @@ CREATE OR REPLACE SECURITY INTEGRATION DBT_CLOUD | ENABLED | Required | | OAUTH_CLIENT | Required | | OAUTH_CLIENT_TYPE | Required | -| OAUTH_REDIRECT_URI | Required. Use the access URL that corresponds to your server [region](/docs/deploy/regions). If dbt Cloud is deployed on-premises, use the domain name of your application instead of the access URL. | +| OAUTH_REDIRECT_URI | Required. Use the access URL that corresponds to your server [region](/docs/cloud/about-cloud/regions-ip-addresses). If dbt Cloud is deployed on-premises, use the domain name of your application instead of the access URL. | | OAUTH_ISSUE_REFRESH_TOKENS | Required | | OAUTH_REFRESH_TOKEN_VALIDITY | Required. This configuration dictates the number of seconds that a refresh token is valid for. 
Use a smaller value to force users to re-authenticate with Snowflake more frequently. | @@ -95,7 +96,20 @@ When clicking on the `Connect Snowflake Account` successfully redirects you to t * Your user might not have access to the Snowflake role defined on the development credentials in dbt Cloud. Double-check that you have access to that role and if the role name has been correctly entered in as Snowflake is case sensitive. * You're trying to use a role that is in the [BLOCKED_ROLES_LIST](https://docs.snowflake.com/en/user-guide/oauth-partner.html#blocking-specific-roles-from-using-the-integration), such as `ACCOUNTADMIN`. +#### The requested scope is invalid +When you select the `Connect Snowflake Account` button to try to connect to your Snowflake account, you might get an error that says `The requested scope is invalid` even though you were redirected to the Snowflake login page successfully. + +This error might be because of a configuration issue in the Snowflake OAuth flow, where the `role` in the profile config is mandatory for each user and doesn't inherit it from the project connection page. This means each user needs to supply their role information, regardless of whether it's provided on the project connection page. +* In the Snowflake OAuth flow, `role` in the profile config is not optional, as it does not inherit from the project connection config. So each user must supply their role, regardless of whether it is provided in the project connection. + #### Server error 500 -If you experience a 500 server error when redirected from Snowflake to dbt Cloud, double check that you have whitelisted [dbt Cloud's IP addresses](/docs/deploy/regions) on a Snowflake account level. +If you experience a 500 server error when redirected from Snowflake to dbt Cloud, double-check that you have allow listed [dbt Cloud's IP addresses](/docs/cloud/about-cloud/regions-ip-addresses) on a Snowflake account level. 
+ +Enterprise customers who have single-tenant deployments will have a different range of IP addresses (network CIDR ranges) to allow list. -Enterprise customers who have single-tenant deployments will have a different range of IP addresses (network CIDR ranges) to whitelist. +Depending on how you've configured your Snowflake network policies or IP allow listing, you may have to explicitly add the network policy that includes the allow listed dbt Cloud IPs to the security integration you just made. + +``` +ALTER SECURITY INTEGRATION <security_integration_name> +SET NETWORK_POLICY = <network_policy_name> ; +``` diff --git a/website/docs/docs/collaborate/manage-access/set-up-sso-azure-active-directory.md b/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md similarity index 79% rename from website/docs/docs/collaborate/manage-access/set-up-sso-azure-active-directory.md rename to website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md index 736eba16850..349c3d8ecd7 100644 --- a/website/docs/docs/collaborate/manage-access/set-up-sso-azure-active-directory.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-azure-active-directory.md @@ -1,14 +1,13 @@ --- title: "Set up SSO with Azure Active Directory" +description: "Learn how dbt Cloud administrators can use Azure Active Directory to control access in a dbt Cloud account." id: "set-up-sso-azure-active-directory" sidebar_label: "Set up SSO with Azure AD" --- -:::info Enterprise Feature -This guide describes a feature of the dbt Cloud Enterprise plan. If you’re -interested in learning more about an Enterprise plan, contact us at -sales@getdbt.com. -::: +import SetUpPages from '/snippets/_sso-docs-mt-available.md'; + + dbt Cloud Enterprise supports single-sign on via Azure Active Directory (Azure AD). You will need permissions to create and manage a new Azure AD application. 
@@ -41,15 +40,14 @@ need to select the appropriate directory and then register a new application. 4. Configure the **Redirect URI**. 
The table below shows the appropriate Redirect URI values for single-tenant and multi-tenant deployments. For most - enterprise use-cases, you will want to use the single-tenant Redirect URI. + enterprise use-cases, you will want to use the single-tenant Redirect URI. Replace `YOUR_AUTH0_URI` with the [appropriate Auth0 URI](/docs/cloud/manage-access/sso-overview#auth0-multi-tenant-uris) for your region and plan. | Application Type | Redirect URI | | ----- | ----- | -| Single-Tenant _(recommended)_ | `https://cloud.getdbt.com/complete/azure_single_tenant` | -| Multi-Tenant | `https://cloud.getdbt.com/complete/azure_multi_tenant` | +| Single-Tenant _(recommended)_ | `https://YOUR_AUTH0_URI/login/callback` | +| Multi-Tenant | `https://YOUR_AUTH0_URI/login/callback` | -*Note:* If your dbt account instance is a VPC deployment or is based [outside the US](/docs/deploy/regions), your login URL will use the domain supplied to you by your dbt Labs account team, instead of the domain `cloud.getdbt.com`. 5. Save the App registration to continue setting up Azure AD SSO @@ -115,7 +113,7 @@ Under **Properties** check the toggle setting for **User assignment required?** 16. Under **Manage**, click **Certificates & secrets** 17. Click **+New client secret** 18. Name the client secret "dbt Cloud" (or similar) to identify the secret -19. Select **Never** as the expiration value for this secret +19. Select **730 days (24 months)** as the expiration value for this secret (recommended) 20. Click **Add** to finish creating the client secret value (not the client secret ID) 21. Record the generated client secret somewhere safe. Later in the setup process, we'll use this client secret in dbt Cloud to finish configuring the @@ -139,7 +137,7 @@ To complete setup, follow the steps below in the dbt Cloud application. ### Supplying credentials -24. Go to [Settings](https://cloud.getdbt.com/next/settings/profile). On the left side, select **Single Sign On** under **Account Settings**. +24. 
Click the gear icon at the top right and select **Profile settings**. To the left, select **Single Sign On** under **Account Settings**. 25. Click the **Edit** button and supply the following SSO details: | Field | Value | @@ -148,8 +146,8 @@ To complete setup, follow the steps below in the dbt Cloud application. | **Client ID** | Paste the **Application (client) ID** recorded in the steps above | | **Client Secret** | Paste the **Client Secret** (remember to use the Secret Value instead of the Secret ID) recorded in the steps above | | **Tenant ID** | Paste the **Directory (tenant ID)** recorded in the steps above | -| **Domain** | Enter the domain name for your Azure directory (eg. `fishtownanalytics.com`). Only users with accounts in this directory with this primary domain will be able to log into the dbt Cloud application. Optionally, you may specify a CSV of domains which are _all_ authorized to access your dbt Cloud account (eg. `fishtownanalytics.com, fishtowndata.com`) Ensure that the domain(s) match the values configured on user accounts in Azure | -| **Slug** | Enter your desired login slug. Users will be able to log into dbt Cloud by navigating to `https://cloud.getdbt.com/enterprise-login/`. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. | +| **Domain** | Enter the domain name for your Azure directory (such as `fishtownanalytics.com`). Only use the primary domain; this won't block access for other domains. | +| **Slug** | Enter your desired login slug. Users will be able to log into dbt Cloud by navigating to `https://YOUR_ACCESS_URL/enterprise-login/LOGIN-SLUG`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/manage-access/sso-overview#auth0-multi-tenant-uris) for your region and plan. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. 
| @@ -158,19 +156,14 @@ To complete setup, follow the steps below in the dbt Cloud application. here, you can navigate to the login URL generated for your account's _slug_ to test logging in with Azure AD. -:::success Logging in -Users in your Azure AD account will now be able to log into the application -by navigating to the URL: + -`https://cloud.getdbt.com/enterprise-login/` -::: -*Note:* If your dbt account instance is a VPC deployment or is [based outside the US](/docs/deploy/regions), your login URL will use the domain supplied to you by your dbt Labs account team, instead of the domain `cloud.getdbt.com`. ## Setting up RBAC Now you have completed setting up SSO with Azure AD, the next steps will be to set up -[RBAC groups](/docs/collaborate/manage-access/enterprise-permissions) to complete your access control configuration. +[RBAC groups](/docs/cloud/manage-access/enterprise-permissions) to complete your access control configuration. ## Troubleshooting Tips diff --git a/website/docs/docs/collaborate/manage-access/set-up-sso-google-workspace.md b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md similarity index 82% rename from website/docs/docs/collaborate/manage-access/set-up-sso-google-workspace.md rename to website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md index 3aba99f2bd0..19779baf615 100644 --- a/website/docs/docs/collaborate/manage-access/set-up-sso-google-workspace.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-google-workspace.md @@ -1,13 +1,12 @@ --- title: "Set up SSO with Google Workspace" +description: "Learn how dbt Cloud administrators can use Single-Sign On (SSO) via Google GSuite to control access in a dbt Cloud account." id: "set-up-sso-google-workspace" --- -:::info Enterprise Feature -This guide describes a feature of the dbt Cloud Enterprise plan. If you’re -interested in learning more about an Enterprise plan, contact us at -sales@getdbt.com. 
-::: +import SetUpPages from '/snippets/_sso-docs-mt-available.md'; + + dbt Cloud Enterprise supports Single-Sign On (SSO) via Google GSuite. You will need permissions to create and manage a new Google OAuth2 application, as well as @@ -49,28 +48,22 @@ Client Secret for use in dbt Cloud. | ---------------------- | ------------ | ------ | | **Application type** | internal | required | | **Application name** | dbt Cloud | required | -| **Application logo** | Download the logo here | optional | -| **Authorized domains** | `getdbt.com` | If deploying into a VPC, use the domain for your deployment | +| **Application logo** | Download the logo here | optional | +| **Authorized domains** | `getdbt.com` (US multi-tenant), `getdbt.com` and `dbt.com` (US Cell 1), `dbt.com` (EMEA or AU) | If deploying into a VPC, use the domain for your deployment | | **Scopes** | `email, profile, openid` | The default scopes are sufficient | 6. Save the **Consent screen** settings to navigate back to the **Create OAuth client id** page. -7. Use the following configuration values when creating your Credentials: - -:::caution Authorized URIs -If you are deploying dbt Cloud into a VPC, you should use the hostname where -the dbt Cloud application is deployed instead of `https://cloud.getdbt.com` in -the _Authorized Javascript origins_ and _Authorized Redirect URIs_ configurations. -::: +7. Use the following configuration values when creating your Credentials, replacing `YOUR_ACCESS_URL` and `YOUR_AUTH0_URI` with the [appropriate Access URL and Auth0 URI](/docs/cloud/manage-access/sso-overview#auth0-multi-tenant-uris) for your region and plan. 
| Config | Value | | ------ | ----- | | **Application type** | Web application | | **Name** | dbt Cloud | -| **Authorized Javascript origins** | `https://cloud.getdbt.com` | -| **Authorized Redirect URIs** | `https://cloud.getdbt.com/complete/gsuite` | +| **Authorized Javascript origins** | `https://YOUR_ACCESS_URL` | +| **Authorized Redirect URIs** | `https://YOUR_AUTH0_URI/login/callback` | @@ -103,7 +96,7 @@ Settings. account using GSuite auth. Optionally, you may specify a CSV of domains which are _all_ authorized to access your dbt Cloud account (eg. `dbtlabs.com, fishtowndata.com`) - **Slug**: Enter your desired login slug. Users will be able to log into dbt - Cloud by navigating to `https://cloud.getdbt.com/enterprise-login/`. Login slugs must + Cloud by navigating to `https://YOUR_ACCESS_URL/enterprise-login/LOGIN-SLUG`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. The `LOGIN-SLUG` must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. @@ -118,13 +111,13 @@ Settings. -If the verification information looks appropriate, then you have completed -the configuration of GSuite SSO. Members of your team should now be able to log -into the dbt Cloud application at `https://cloud.getdbt.com/enterprise-login/`. +If the verification information looks appropriate, then you have completed the configuration of GSuite SSO. + + ## Setting up RBAC Now you have completed setting up SSO with GSuite, the next steps will be to set up -[RBAC groups](/docs/collaborate/manage-access/enterprise-permissions) to complete your access control configuration. +[RBAC groups](/docs/cloud/manage-access/enterprise-permissions) to complete your access control configuration. 
## Troubleshooting diff --git a/website/docs/docs/collaborate/manage-access/set-up-sso-okta.md b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md similarity index 75% rename from website/docs/docs/collaborate/manage-access/set-up-sso-okta.md rename to website/docs/docs/cloud/manage-access/set-up-sso-okta.md index 999228dd485..5ec70443d1f 100644 --- a/website/docs/docs/collaborate/manage-access/set-up-sso-okta.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-okta.md @@ -3,11 +3,9 @@ title: "Set up SSO with Okta" id: "set-up-sso-okta" --- -:::info Enterprise Feature +import SetUpPages from '/snippets/_sso-docs-mt-available.md'; -This guide describes a feature of the dbt Cloud Enterprise plan. If you’re interested in learning more about an Enterprise plan, contact us at sales@getdbt.com. - -::: + ## Okta SSO @@ -18,8 +16,6 @@ dbt Cloud Enterprise supports single-sign on via Okta (using SAML). Currently su * Just-in-time provisioning This guide outlines the setup process for authenticating to dbt Cloud with Okta. -If you have any questions during the setup process, please contact support -(support@getdbt.com) for assistance. ## Configuration in Okta @@ -65,22 +61,19 @@ Click **Next** to continue. ### Configure SAML Settings -The SAML Settings page configures how Okta and dbt Cloud communicate. If your -dbt Cloud instance is _not_ running at `cloud.getdbt.com`, you will want to replace -the domain names shown below with the domain name where your instance is running. If you -aren't sure which values you should use, please contact support (support@getdbt.com). +The SAML Settings page configures how Okta and dbt Cloud communicate. You will want to use an [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. To complete this section, you will need a _login slug_. This slug controls the URL where users on your account can log into your application via Okta. 
Login slugs are typically the lowercased name of your organization separated with -dashes. For example, the _login slug_ for dbt Labs would be +dashes. It should contain only letters, numbers, and dashes. For example, the _login slug_ for dbt Labs would be `dbt-labs`. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. -On the **SAML Settings** page, enter the following values: + -* **Single sign on URL**: `https://cloud.getdbt.com/complete/okta` -* **Audience URI (SP Entity ID)**: `https://cloud.getdbt.com/` +* **Single sign on URL**: `https://YOUR_AUTH0_URI/login/callback?connection=` +* **Audience URI (SP Entity ID)**: `urn:auth0::{login slug}` * **Relay State**: `` `. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. | +| **Slug** | Enter your desired login slug. Users will be able to log into dbt Cloud by navigating to `https://YOUR_ACCESS_URL/enterprise-login/LOGIN-SLUG`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. | -`https://cloud.getdbt.com/enterprise-login/` -::: ## Setting up RBAC Now you have completed setting up SSO with Okta, the next steps will be to set up -[RBAC groups](/docs/collaborate/manage-access/enterprise-permissions) to complete your access control configuration. +[RBAC groups](/docs/cloud/manage-access/enterprise-permissions) to complete your access control configuration. 
diff --git a/website/docs/docs/collaborate/manage-access/set-up-sso-saml-2.0.md b/website/docs/docs/cloud/manage-access/set-up-sso-saml-2.0.md similarity index 71% rename from website/docs/docs/collaborate/manage-access/set-up-sso-saml-2.0.md rename to website/docs/docs/cloud/manage-access/set-up-sso-saml-2.0.md index 4cb4aba918d..db3efdbeb74 100644 --- a/website/docs/docs/collaborate/manage-access/set-up-sso-saml-2.0.md +++ b/website/docs/docs/cloud/manage-access/set-up-sso-saml-2.0.md @@ -3,12 +3,9 @@ title: "Set up SSO with SAML 2.0" id: "set-up-sso-saml-2.0" --- -:::info Enterprise Feature +import SetUpPages from '/snippets/_sso-docs-mt-available.md'; -This guide describes a feature of the dbt Cloud Enterprise plan. If you’re interested in learning -more about an Enterprise plan, contact us at sales@getdbt.com. - -::: + dbt Cloud Enterprise supports single-sign on (SSO) for any SAML 2.0-compliant identity provider (IdP). Currently supported features include: @@ -17,11 +14,15 @@ Currently supported features include: * Just-in-time provisioning This document details the steps to integrate dbt Cloud with an identity -provider in order to configure Single Sign On and [role-based access control](/docs/collaborate/manage-access/about-access#role-based-access-control). +provider in order to configure Single Sign On and [role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control). + +## Auth0 Multi-tenant URIs + + ## Generic SAML 2.0 integrations -You can use the instructions in this section to configure an identity provider that isn't listed below. +If your SAML identity provider is one of Okta, Google, Azure or OneLogin, navigate to the relevant section further down this page. For all other SAML compliant identity providers, you can use the instructions in this section to configure that identity provider. 
### Configure your identity provider @@ -38,17 +39,16 @@ You'll need administrator access to your SAML 2.0 compliant identity provider to #### Configuring the application -The following steps assume your dbt Cloud instance is running at `https://cloud.getdbt.com`. If your deployment is running at a different url, then substitute ` cloud.getdbt.com` for the url of your instance. + -To complete this section, you will need to create a login slug. This slug controls the URL where users on your account -can log into your application. Login slugs are typically the lowercased name of your organization +To complete this section, you will need to create a login slug. This slug controls the URL where users on your account can log into your application. It should contain only letters, numbers, and dashes. Login slugs are typically the lowercased name of your organization separated with dashes. For example, the login slug for dbt Labs would be `dbt-labs`. Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. When prompted for the SAML 2.0 application configurations, supply the following values: -- Single sign on URL: `https://cloud.getdbt.com/complete/saml` -- Audience URI (SP Entity ID): `https://cloud.getdbt.com/` +* Single sign on URL: `https://YOUR_AUTH0_URI/login/callback?connection=` +* Audience URI (SP Entity ID): `urn:auth0::{login slug}` - Relay State: `` Additionally, you may configure the IdP attributes passed from your identity provider into dbt Cloud. 
We recommend using the following values: @@ -56,11 +56,12 @@ Additionally, you may configure the IdP attributes passed from your identity pro | name | name format | value | description | | ---- | ----------- | ----- | ----------- | -| email | Unspecified | ${user.email} | The user's email address | -| first_name | Unspecified | ${user.first_name} | The user's first name | -| last_name | Unspecified | ${user.last_name} | The user's last name | +| email | Unspecified | user.email | The user's email address | +| first_name | Unspecified | user.first_name | The user's first name | +| last_name | Unspecified | user.last_name | The user's last name | +| NameID (if applicable) | Unspecified | user.email | The user's email address | -dbt Cloud's [role-based access control](/docs/collaborate/manage-access/about-access#role-based-access-control) relies +dbt Cloud's [role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control) relies on group mappings from the IdP to assign dbt Cloud users to dbt Cloud groups. To use role-based access control in dbt Cloud, also configure your identity provider to provide group membership information in user attribute called @@ -74,7 +75,6 @@ provider to provide group membership information in user attribute called You may use a restricted group attribute statement to limit the groups set to dbt Cloud for each authenticated user. For example, if all of your dbt Cloud groups start with `DBT_CLOUD_...`, you may optionally apply a filter like `Starts With: DBT_CLOUD_`. -Please contact support if you have any questions. ::: ### Collect integration secrets @@ -117,6 +117,12 @@ You can use the instructions in this section to configure Okta as your identity ### Configure the Okta application + + +To complete this section, you will need to create a login slug. This slug controls the URL where users on your account can log into your application. Login slugs are typically the lowercased name of your organization. 
It should contain only letters, numbers, and dashes. +Words in the slug are separated with dashes. For example, the login slug for dbt Labs would be `dbt-labs`. +Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. + 1. On the **General Settings** page, enter the following details: * **App name**: dbt Cloud @@ -135,8 +141,8 @@ You can use the instructions in this section to configure Okta as your identity 1. On the **SAML Settings** page, enter the following values: - * **Single sign on URL**: `https://cloud.getdbt.com/complete/okta` - * **Audience URI (SP Entity ID)**: `https://cloud.getdbt.com/` + * **Single sign on URL**: `https://YOUR_AUTH0_URI/login/callback?connection=` + * **Audience URI (SP Entity ID)**: `urn:auth0::` * **Relay State**: `` @@ -148,9 +154,9 @@ dbt Cloud expects by using the Attribute Statements and Group Attribute Statemen | Name | Name format | Value | Description | | -------------- | ----------- | -------------------- | -------------------------- | - | `email` | Unspecified | `${user.email}` | _The user's email address_ | - | `first_name` | Unspecified | `${user.firstName}` | _The user's first name_ | - | `last_name` | Unspecified | `${user.lastName}` | _The user's last name_ | + | `email` | Unspecified | `user.email` | _The user's email address_ | + | `first_name` | Unspecified | `user.firstName` | _The user's first name_ | + | `last_name` | Unspecified | `user.lastName` | _The user's last name_ | 4. The following table illustrates expected **Group Attribute Statements**: @@ -214,6 +220,13 @@ Use this section if you are configuring Google as your identity provider. ### Configure the Google application + + +To complete this section, you will need to create a login slug. This slug controls the URL where users on your account +can log into your application. Login slugs are typically the lowercased name of your organization +separated with dashes. 
It should contain only letters, numbers, and dashes. 
For example, the login slug for dbt Labs would be `dbt-labs`. +Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. + 1. Sign into your **Google Admin Console** via an account with super administrator privileges. 2. From the Admin console Home page, go to **Apps** and then click **Web and mobile apps**. 3. Click **Add**, then click **Add custom SAML app**. @@ -229,13 +242,14 @@ Use this section if you are configuring Google as your identity provider. 2. Download the **IDP metadata**. 3. Copy the **SSO URL** and **Entity ID** and download the **Certificate** (or **SHA-256 fingerprint**, if needed). 4. Enter the following values on the **Service Provider Details** window: - - **ACS URL**: `https://cloud.getdbt.com/complete/saml` - - **Audience URI (SP Entity ID)**: `https://cloud.getdbt.com/` - - **Start URL**: (if needed) -5. The default **Name ID** is the primary email. Multi-value input is not supported. -6. Use the **Attribute mapping** page to map your organization's Google Directory Attributes to the format that + * **ACS URL**: `https://YOUR_AUTH0_URI/login/callback?connection=` + * **Audience URI (SP Entity ID)**: `urn:auth0::` + - **Start URL**: `` +5. Select the **Signed response** checkbox. +6. The default **Name ID** is the primary email. Multi-value input is not supported. +7. Use the **Attribute mapping** page to map your organization's Google Directory Attributes to the format that dbt Cloud expects. -7. Click **Add another mapping** to map additional attributes. +8. Click **Add another mapping** to map additional attributes. Expected **Attributes**: @@ -245,7 +259,13 @@ Expected **Attributes**: | `Last name` | Unspecified | `last_name` | The user's last name. | | `Primary email`| Unspecified | `email` | The user's email address. | -8. Click **Finish** to continue. +9. 
To use [role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control) in dbt Cloud, enter the groups in the **Group membership** field during configuration: + +| Google groups | App attributes | +| -------------- | -------------- | +| Name of groups | `groups` | + +10. Click **Finish** to continue. ### Finish Google setup @@ -254,7 +274,9 @@ Expected **Attributes**: 2. Select your SAML app. 3. Click **User access**. 4. To turn on or off a service for everyone in your organization, click **On for everyone** or **Off for everyone**, and then click **Save**. -5. Ensure that the email addresses your users use to sign in to the SAML app match the email addresses they use to sign in to your Google domain (Changes typically take effect in minutes, but can take up to 24 hours). +5. Ensure that the email addresses your users use to sign in to the SAML app match the email addresses they use to sign in to your Google domain. + +**Note:** Changes typically take effect in minutes, but can take up to 24 hours. ### Finish setup @@ -266,6 +288,12 @@ If you're using Azure Active Directory (Azure AD), the instructions below will h ### Create Azure AD Enterprise application + + +To complete this section, you will need to create a login slug. This slug controls the URL where users on your account can log into your application. Login slugs are typically the lowercased name of your organization +separated with dashes. It should contain only letters, numbers, and dashes. For example, the login slug for dbt Labs would be `dbt-labs`. +Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. + Follow these steps to set up single sign-on (SSO) with dbt Cloud: 1. Log into your Azure account. @@ -275,7 +303,7 @@ Follow these steps to set up single sign-on (SSO) with dbt Cloud: 5. Select **Integrate any other application you don't find in the gallery (Non-gallery)** as the application type. 6. 
Click **Create**. 7. You can find the new application by clicking **Enterprise applications** and selecting **All applications**. -8. Click the application you just created and follow the instructions for configuring it in [Configuring SAML endpoints in AD](#configuring-saml-endpoints-in-ad). +8. Click the application you just created. 9. Select **Single sign-on** under Manage in the left navigation. 10. Click **Set up single sign on** under Getting Started. 11. Click **SAML** in "Select a single sign-on method" section. @@ -284,12 +312,12 @@ Follow these steps to set up single sign-on (SSO) with dbt Cloud: | Field | Value | | ----- | ----- | - | **Identifier (Entity ID)** | Base URL for dbt Cloud. Use `https://cloud.getdbt.com/` or `https://YOUR_COMPANY.getdbt.com/` for a single tenant instance. | - | **Reply URL (Assertion Consumer Service URL)** | Use `https://cloud.getdbt.com/complete/saml` or `https://YOUR_COMPANY.getdbt.com/complete/saml` for a single tenant instance. | - | **Relay State** | The slug you will configure in dbt Cloud. It's usually your company name, but you can pick anything you'd like. | + | **Identifier (Entity ID)** | Use `urn:auth0::`. | + | **Reply URL (Assertion Consumer Service URL)** | Use `https://YOUR_AUTH0_URI/login/callback?connection=`. | + | **Relay State** | `` | 14. Click **Save** at the top of the form. -#### Creating SAML Attributes in AD +### Creating SAML settings From the Set up Single Sign-On with SAML page: @@ -311,6 +339,7 @@ From the Set up Single Sign-On with SAML page: **Note:** Keep in mind that the Group ID in Azure AD maps to that group's GUID. It should be specified in lowercase for the mappings to work as expected. The Source Attribute field alternatively can be set to a different value of your preference. +### Finish setup 9. After creating the Azure application, follow the instructions in the [dbt Cloud Setup](#dbt-cloud-setup) section to complete the integration. 
@@ -323,6 +352,12 @@ To configure OneLogin, you will need **Administrator** access. ### Configure the OneLogin application + + +To complete this section, you will need to create a login slug. This slug controls the URL where users on your account can log into your application. Login slugs are typically the lowercased name of your organization +separated with dashes. It should contain only letters, numbers, and dashes. For example, the login slug for dbt Labs would be `dbt-labs`. +Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company. + 1. Log into OneLogin, and add a new SAML 2.0 Application. 2. Configure the application with the following details: - **Platform:** Web @@ -332,32 +367,25 @@ To configure OneLogin, you will need **Administrator** access. ### Configure SAML settings -The following steps assume your dbt Cloud deployment instance uses `https://cloud.getdbt.com`. If your [deployment](/docs/deploy/regions) uses a different URL, then substitute ` cloud.getdbt.com` with the URL of your instance. - -3. To complete this section, you will need to create a login slug. This slug controls the URL where users on your account -can log into your application. Login slugs are typically the lowercased name of your organization -separated with dashes. For example, the login slug for dbt Labs would be `dbt-labs`. -Login slugs must be unique across all dbt Cloud accounts, so pick a slug that uniquely identifies your company.

-✅ Use: `your-company-name`
-❌ Avoid: `Your-Company-Name` -4. Under the **Configuration tab**, input the following values: +3. Under the **Configuration tab**, input the following values: - **RelayState:** `` - - **Audience (EntityID):** https://cloud.getdbt.com/ - - **ACS (Consumer) URL Validator:** https://cloud.getdbt.com/complete/saml - - **ACS (Consumer) URL:** https://cloud.getdbt.com/complete/saml + - **Audience (EntityID):** `urn:auth0::` + - **ACS (Consumer) URL Validator:** `https://YOUR_AUTH0_URI/login/callback?connection=` + - **ACS (Consumer) URL:** `https://YOUR_AUTH0_URI/login/callback?connection=` -5. Next, go to the **Parameters tab**. You must have a parameter for the Email, First Name, and Last Name attributes and include all parameters in the SAML assertions. When you add the custom parameters, make sure you check the **Include in SAML assertion** checkbox. +4. Next, go to the **Parameters tab**. You must have a parameter for the Email, First Name, and Last Name attributes and include all parameters in the SAML assertions. When you add the custom parameters, make sure you select the **Include in SAML assertion** checkbox. We recommend using the following values: | name | name format | value | | ---- | ----------- | ----- | +| NameID | Unspecified | Email | | email | Unspecified | Email | | first_name | Unspecified | First Name | | last_name | Unspecified | Last Name | -dbt Cloud's [role-based access control](/docs/collaborate/manage-access/about-access#role-based-access-control) relies +dbt Cloud's [role-based access control](/docs/cloud/manage-access/about-user-access#role-based-access-control) relies on group mappings from the IdP to assign dbt Cloud users to dbt Cloud groups. To use role-based access control in dbt Cloud, also configure OneLogin to provide group membership information in user attribute called `groups`: @@ -369,7 +397,7 @@ use role-based access control in dbt Cloud, also configure OneLogin to provide g ### Collect integration secrets -6. 
After confirming your details, go to the **SSO tab**. OneLogin should show you the following values for +5. After confirming your details, go to the **SSO tab**. OneLogin should show you the following values for the new integration. Keep these values somewhere safe, as you will need them to complete setup in dbt Cloud. - Issuer URL @@ -378,7 +406,7 @@ the new integration. Keep these values somewhere safe, as you will need them to ### Finish setup -7. After creating the OneLogin application, follow the instructions in the [dbt Cloud Setup](#dbt-cloud-setup) +6. After creating the OneLogin application, follow the instructions in the [dbt Cloud Setup](#dbt-cloud-setup) section to complete the integration. ## dbt Cloud Setup @@ -402,15 +430,12 @@ To complete setup, follow the steps below in dbt Cloud: title="Configuring the application in dbt Cloud" /> 4. Click **Save** to complete setup for the SAML 2.0 integration. -5. After completing the setup, you can navigate to the URL generated for your account's _slug_ to -test logging in with your identity provider. Additionally, users added the the SAML 2.0 app -will be able to log in to dbt Cloud from the IdP directly. +5. After completing the setup, you can navigate to the URL generated for your account's _slug_ to test logging in with your identity provider. Additionally, users added to the SAML 2.0 app will be able to log in to dbt Cloud from the IdP directly. -Users in your IdP will now be able to log into the application by navigating to the URL: -`https://cloud.getdbt.com/enterprise-login/` + ### Setting up RBAC After configuring an identity provider, you will be able to set up [role-based -access control](/docs/collaborate/manage-access/enterprise-permissions) for your account. +access control](/docs/cloud/manage-access/enterprise-permissions) for your account. 
diff --git a/website/docs/docs/collaborate/manage-access/sso-overview.md b/website/docs/docs/cloud/manage-access/sso-overview.md similarity index 86% rename from website/docs/docs/collaborate/manage-access/sso-overview.md rename to website/docs/docs/cloud/manage-access/sso-overview.md index 06fd77d48fe..f613df7907e 100644 --- a/website/docs/docs/collaborate/manage-access/sso-overview.md +++ b/website/docs/docs/cloud/manage-access/sso-overview.md @@ -1,18 +1,26 @@ --- -title: "SSO Overview" +title: "Single sign-on (SSO) Overview" id: "sso-overview" - +pagination_next: "docs/cloud/manage-access/set-up-sso-saml-2.0" +pagination_prev: null --- -:::info Enterprise Feature - -This guide describes a feature of the dbt Cloud Enterprise plan. -If you’re interested in learning more about an Enterprise plan, contact us at sales@getdbt.com. +This overview explains how users are provisioned in dbt Cloud via Single Sign-On (SSO). +dbt Cloud supports JIT (Just-in-Time) provisioning and IdP-initiated login. You can learn more about our supported options [here](https://www.getdbt.com/pricing/). +:::tip Configuring SSO +Once you configure SSO, even partially, you cannot disable or revert it. When you configure it, you will want to make sure you do so completely. ::: -This overview explains how users are provisioned in dbt Cloud via Single Sign-On (SSO). -dbt Cloud supports JIT (Just-in-Time) provisioning and IdP-initiated login. You can learn more about our supported options [here](https://www.getdbt.com/pricing/). +## Prerequisites + +- You have a dbt Cloud account enrolled in the Enterprise plan. [Contact us](mailto:sales@getdbt.com) to learn more and enroll. + +## Auth0 Multi-tenant URIs + + + +## SSO process The diagram below explains the basic process by which users are provisioned in dbt Cloud upon logging in with SSO. 
@@ -31,21 +39,13 @@ The diagram below explains the basic process by which users are provisioned in d - **No**: If so, create a new entry in the dbt Cloud database for the new user. - **Create dbt Cloud User**: This will create a new entry in the dbt Cloud database for the new user. This user record contains the user's email address, first and last name, and any IdP attributes (e.g. groups) passed along from the Identity Provider. - **Attach Matching Accounts**: dbt Cloud find all of the accounts configured to match the SSO config used by this user to log in, and then create a user license record mapping the user to the account. This step will also delete any licenses that the user should not have based on the current SSO config. -- **Attach Matching Permissions (Groups)**: dbt Cloud iterates through the groups on the matching accounts, and find all that fit one of the below catergories: - - have an SSO mapping group that is assigned to the user - - have the "Assign by Default" option checked. +- **Attach Matching Permissions (Groups)**: dbt Cloud iterates through the groups on the matching accounts, and finds all that fit one of the following categories: + - Have an SSO mapping group that is assigned to the user + - Have the "Assign by Default" option checked. Then, assign all of these (and only these) to the user license. This step will also remove any permissions that the user should not have based on the current SSO group mappings. - **dbt Cloud Application**: After these steps, the user is redirected into the dbt Cloud application, and they can begin to use the application normally. -## SSO Enforcement - -:::info Security Update - -Please read the following update if you've enabled SSO but still have non-admin users logging in with a password. The changes outlined here will be released after September 15, 2022.
- -::: - -Starting September 15, 2022, we will be making these security changes to SSO to increase the security posture of your environment: +## SSO enforcement * **SSO Enforcement:** If you have SSO turned on in your organization, dbt Cloud will enforce SSO-only logins for all non-admin users. If an Account Admin already has a password, they can continue logging in with a password. * **SSO Re-Authentication:** dbt Cloud will prompt you to re-authenticate using your SSO provider every 24 hours to ensure high security. diff --git a/website/docs/docs/cloud/secure/about-privatelink.md b/website/docs/docs/cloud/secure/about-privatelink.md new file mode 100644 index 00000000000..29003f65a21 --- /dev/null +++ b/website/docs/docs/cloud/secure/about-privatelink.md @@ -0,0 +1,21 @@ +--- +title: "About PrivateLink" +id: about-privatelink +description: "Configuring PrivateLink for AWS" +sidebar_label: "About PrivateLink" +--- + +PrivateLink enables a private connection from any dbt Cloud Multi-Tenant environment to your data platform hosted on AWS using [AWS PrivateLink](https://aws.amazon.com/privatelink/) technology. PrivateLink allows dbt Cloud customers to meet security and compliance controls as it allows connectivity between dbt Cloud and your data platform without traversing the public internet. This feature is supported in most regions across NA, Europe, and Asia, but [contact us](https://www.getdbt.com/contact/) if you have questions about availability. + +### Cross-region PrivateLink + +dbt Labs has a worldwide network of regional VPCs. These VPCs are specifically used to host PrivateLink VPC endpoints, which are connected to dbt Cloud instance environments. To ensure security, access to these endpoints is protected by security groups, network policies, and application connection safeguards. The connected services are also authenticated. 
Currently, we have multiple customers successfully connecting to their PrivateLink endpoints in different AWS regions within dbt Cloud. + +### Configuring PrivateLink + +dbt Cloud supports the following data platforms for use with the PrivateLink feature. Instructions for enabling PrivateLink for the various data platform providers are unique. The following guides will walk you through the necessary steps, including working with [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support) to complete the connection in the dbt private network and setting up the endpoint in dbt Cloud. + +- [Snowflake](/docs/cloud/secure/snowflake-privatelink) +- [Databricks](/docs/cloud/secure/databricks-privatelink) +- [Redshift](/docs/cloud/secure/redshift-privatelink) +- [Postgres](/docs/cloud/secure/postgres-privatelink) diff --git a/website/docs/docs/cloud/secure/databricks-privatelink.md b/website/docs/docs/cloud/secure/databricks-privatelink.md new file mode 100644 index 00000000000..a2c9e208459 --- /dev/null +++ b/website/docs/docs/cloud/secure/databricks-privatelink.md @@ -0,0 +1,32 @@ +--- +title: "Configuring Databricks PrivateLink" +id: databricks-privatelink +description: "Configuring PrivateLink for Databricks" +sidebar_label: "PrivateLink for Databricks" +pagination_next: null +--- + +The following steps will walk you through the setup of a Databricks AWS PrivateLink endpoint in the dbt Cloud multi-tenant environment. + +## Configure PrivateLink + +1. Locate your [Databricks Workspace ID](https://kb.databricks.com/en_US/administration/find-your-workspace-id#:~:text=When%20viewing%20a%20Databricks%20workspace,make%20up%20the%20workspace%20ID) +2. 
Add the required information to the template below, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): +``` +Subject: New Multi-Tenant PrivateLink Request +- Type: Databricks +- Databricks workspace name: +- Databricks cluster AWS Region (e.g., us-east-1, eu-west-2): +- dbt Cloud multi-tenant environment (US, EMEA, AU): +``` +3. Once dbt Cloud support has notified you that setup is complete, [register the VPC endpoint in Databricks](https://docs.databricks.com/administration-guide/cloud-configurations/aws/privatelink.html#step-3-register-privatelink-objects-and-attach-them-to-a-workspace) and attach it to the workspace. + +## Create Connection in dbt Cloud + +Once you've completed the setup in the Databricks environment, you will be able to configure a private endpoint in dbt Cloud: + +1. Navigate to **Settings** → **Create new project** → select **Databricks**. +2. You will see two radio buttons: **Public** and **Private.** Select **Private**. +3. Select the private endpoint from the dropdown (this will automatically populate the hostname/account field). +4. Configure the remaining data platform details. +5. Test your connection and save it. diff --git a/website/docs/docs/cloud/secure/ip-restrictions.md b/website/docs/docs/cloud/secure/ip-restrictions.md new file mode 100644 index 00000000000..093d2a1c876 --- /dev/null +++ b/website/docs/docs/cloud/secure/ip-restrictions.md @@ -0,0 +1,76 @@ +--- +title: "Configuring IP restrictions" +id: ip-restrictions +description: "Configuring IP restrictions to prevent outside traffic from accessing your dbt Cloud environment" +sidebar_label: "IP restrictions" +pagination_next: "docs/cloud/secure/about-privatelink" +pagination_prev: null +--- + +import SetUpPages from '/snippets/_available-tiers-iprestrictions.md'; + + + +IP Restrictions help control which IP addresses are allowed to connect to dbt Cloud.
IP restrictions allow dbt Cloud customers to meet security and compliance controls by only allowing approved IPs to connect to their dbt Cloud environment. This feature is supported in all regions across NA, Europe, and Asia-Pacific, but contact us if you have questions about availability. + +## Configuring IP Restrictions + +To configure IP restrictions, go to **Account Settings** → **IP Restrictions**. IP restrictions provide two methods for determining which IPs can access dbt Cloud: an allowlist and a denylist. IPs in the allowlist are allowed to access dbt Cloud, and IPs in the denylist will be blocked from accessing dbt Cloud. IP Restrictions can be used for a range of use cases, including: + +- Allow only corporate VPN traffic and deny all other traffic +- Deny IPs flagged by the Security team +- Allow only VPN traffic but make an exception for contractors’ IP addresses + +IP restrictions will block all service tokens, user requests done via the API (via personal user token), and the UI if they come from blocked IP addresses. + +For any version control system integrations (GitHub, GitLab, ADO, etc.) inbound into dbt Cloud, ensure their IP addresses are added to the allowlist. + +### Allowing IPs + +To add an IP to the allowlist, from the **IP Restrictions** page: + +1. Click **Edit** +2. Click **Add Rule** +3. Add name and description for the rule + - For example, Corporate VPN CIDR Range +4. Select **Allow** +5. Add the ranges in CIDR notation + - For example, 1.1.1.1/8 + - You cannot add multiple ranges in the same rule. Instead, create a rule per CIDR range. +6. Click **Save** + +Note that simply adding the IP Ranges will not enforce IP restrictions. For more information, see the section “Enabling Restrictions.” + +If you only want to allow the IP ranges added to this list and deny all other requests, adding a denylist is not necessary.
By default, if only an allowlist is added, dbt Cloud will only allow IPs in the allowable range and deny all other IPs. However, you can add a denylist if you want to deny specific IP addresses within your allowlist CIDR range. + +### Blocking IPs (deny) + +If you have one or more IPs defined in the allowlist that need to be denied, you can add those IP ranges to the denylist by doing the following: + +1. Click **Edit** +2. Click **Add Rule** +3. Add name and description for the rule + - For example, "Corporate VPN Deny Range" +4. Select **Deny** +5. Add the ranges or the individual IP addresses in CIDR notation +6. Click **Save** + +:::note Duplicate IP addresses + +If identical IP addresses are in both the allowlist and the denylist, whichever is entered second will fail to save. + +It is possible to put an IP range on one list and then a sub-range or IP address that is part of it on the other. Using USA (Range) and NY (sub-range) as an example, the expected behavior is: +- USA is on the denylist, and NY is on the allowlist - Traffic from the USA will be blocked, but IPs from NY will be allowed. +- USA is on the allowlist, and NY is on the denylist - USA traffic will be allowed, but IPs from NY will be blocked. + +::: + +## Enabling Restrictions + +Once you are done adding all your ranges, IP restrictions can be enabled by selecting the **Enable IP restrictions** button and clicking **Save**. If your IP address is in any of the denylist ranges, you won’t be able to save or enable IP restrictions - this is done to prevent accidental account lockouts. If you do get locked out due to IP changes on your end, please reach out to support@dbtlabs.com. + +Once enabled, when someone attempts to access dbt Cloud from a restricted IP, they will encounter one of the following messages depending on whether they use email & password or SSO login.
+ + + + diff --git a/website/docs/docs/cloud/secure/postgres-privatelink.md b/website/docs/docs/cloud/secure/postgres-privatelink.md new file mode 100644 index 00000000000..482aeb4040d --- /dev/null +++ b/website/docs/docs/cloud/secure/postgres-privatelink.md @@ -0,0 +1,76 @@ +--- +title: "Configure AWS PrivateLink for Postgres" +id: postgres-privatelink +description: "Configuring PrivateLink for Postgres" +sidebar_label: "PrivateLink for Postgres" +--- + +A Postgres database, hosted either in AWS or in a properly connected on-prem data center, can be accessed through a private network connection using AWS Interface-type PrivateLink. The type of Target Group connected to the Network Load Balancer (NLB) may vary based on the location and type of Postgres instance being connected, as explained in the following steps. + +## Configuring Postgres interface-type PrivateLink + +### 1. Provision AWS resources + +Creating an Interface VPC PrivateLink connection requires creating multiple AWS resources in the account containing, or connected to, the Postgres instance: + +- **Security Group (AWS hosted only)** — If you are connecting to an existing Postgres instance, this likely already exists, however, you may need to add or modify Security Group rules to accept traffic from the Network Load Balancer (NLB) created for this Endpoint Service. +- **Target Group** — The Target Group will be attached to the NLB to tell it where to route requests. There are various target types available for NLB Target Groups, so choose the one appropriate for your Postgres setup. + + - Target Type: + + - _[Amazon RDS for PostgreSQL](https://aws.amazon.com/rds/postgresql/)_ - **IP** + + - Find the IP address of your RDS instance using a command line tool such as `nslookup ` or `dig +short ` with your RDS DNS endpoint + + - _Note_: With RDS Multi-AZ failover capabilities the IP address of your RDS instance can change, at which point your Target Group would need to be updated. 
See [this AWS blog post](https://aws.amazon.com/blogs/database/access-amazon-rds-across-vpcs-using-aws-privatelink-and-network-load-balancer/) for more details and a possible solution. + + - _On-prem Postgres server_ - **IP** + + - Use the IP address of the on-prem Postgres server linked to AWS through AWS Direct Connect or a Site-to-Site VPN connection + + - _Postgres on EC2_ - **Instance/ASG** (or **IP**) + + - If your Postgres instance is hosted on EC2 the _instance_ Target Group type (or ideally [using the instance type to connect to an auto-scaling group](https://docs.aws.amazon.com/autoscaling/ec2/userguide/attach-load-balancer-asg.html)) can be used to attach the instance without needing a static IP address + + - The IP type can also be used, with the understanding that the IP of the EC2 instance can change if the instance is relaunched for any reason + + - Target Group protocol: **TCP** + +- **Network Load Balancer (NLB)** — Requires creating a Listener that attaches to the newly created Target Group for port `5432` +- **VPC Endpoint Service** — Attach to the newly created NLB. + - Acceptance required (optional) — Requires you to [accept our connection request](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests) after dbt creates the endpoint. + +### 2. Grant dbt AWS account access to the VPC Endpoint Service + +On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. + + - Principal: `arn:aws:iam::346425330055:role/MTPL_Admin` + + + +### 3. Obtain VPC Endpoint Service Name + +Once the VPC Endpoint Service is provisioned, you can find the service name in the AWS console by navigating to **VPC** → **Endpoint Services** and selecting the appropriate endpoint service. 
You can copy the service name field value and include it in your communication to dbt Cloud support. + + + +### 4. Add the required information to the template below, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): +``` +Subject: New Multi-Tenant PrivateLink Request +- Type: Postgres Interface-type +- VPC Endpoint Service Name: +- Postgres server AWS Region (e.g., us-east-1, eu-west-2): +- dbt Cloud multi-tenant environment (US, EMEA, AU): +``` + +dbt Labs will work on your behalf to complete the PrivateLink setup. Please allow 1-2 business days for this process to complete. Support will contact you when the endpoint is available. + +## Create Connection in dbt Cloud + +Once dbt Cloud support completes the configuration, you can start creating new connections using PrivateLink. + +1. Navigate to **Settings** → **Create new project** → select **PostgreSQL**. +2. You will see two radio buttons: **Public** and **Private.** Select **Private**. +3. Select the private endpoint from the dropdown (this will automatically populate the hostname/account field). +4. Configure the remaining data platform details. +5. Test your connection and save it.
diff --git a/website/docs/docs/cloud/secure/redshift-privatelink.md b/website/docs/docs/cloud/secure/redshift-privatelink.md new file mode 100644 index 00000000000..3ed49e7bb34 --- /dev/null +++ b/website/docs/docs/cloud/secure/redshift-privatelink.md @@ -0,0 +1,111 @@ +--- +title: "Configure AWS PrivateLink for Redshift" +id: redshift-privatelink +description: "Configuring PrivateLink for Redshift" +sidebar_label: "PrivateLink for Redshift" +--- + +AWS provides two different ways to create a PrivateLink VPC endpoint for a Redshift cluster that is running in another VPC: +- [Redshift-managed PrivateLink Endpoints](https://docs.aws.amazon.com/redshift/latest/mgmt/managing-cluster-cross-vpc.html) +- [Redshift Interface-type PrivateLink Endpoints](https://docs.aws.amazon.com/redshift/latest/mgmt/security-private-link.html) + +dbt Cloud supports both types of endpoints, but there are a number of [considerations](https://docs.aws.amazon.com/redshift/latest/mgmt/managing-cluster-cross-vpc.html#managing-cluster-cross-vpc-considerations) to take into account when deciding which endpoint type to use. Redshift-managed provides a far simpler setup with no additional cost, which might make it the preferred option for many, but may not be an option in all environments. Based on these criteria, you will need to determine which is the right type for your system. Follow the instructions from the section below that corresponds to your chosen endpoint type. + +:::note Redshift Serverless +While Redshift Serverless does support Redshift-managed type VPC endpoints, this functionality is not currently available across AWS accounts. Due to this limitation, an Interface-type VPC endpoint service must be used for Redshift Serverless cluster PrivateLink connectivity from dbt Cloud. +::: + +## Configuring Redshift-managed PrivateLink + +1. On the running Redshift cluster, select the **Properties** tab. + + + +2. In the **Granted accounts** section, click **Grant access**. + + + +3. 
Enter the AWS account ID: `346425330055` - _NOTE: This account ID only applies to dbt Cloud Multi-Tenant environments. For Virtual Private/Single-Tenant account IDs please contact [Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support)._ + +4. Choose **Grant access to all VPCs** —or— (optional) contact [Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support) for the appropriate regional VPC ID to designate in the **Grant access to specific VPCs** field. + + + +5. Add the required information to the following template, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): + +``` +Subject: New Multi-Tenant PrivateLink Request +- Type: Redshift-managed +- Redshift cluster name: +- Redshift cluster AWS account ID: +- Redshift cluster AWS Region (e.g., us-east-1, eu-west-2): +- dbt Cloud multi-tenant environment (US, EMEA, AU): +``` + +dbt Labs will work on your behalf to complete the PrivateLink setup. Please allow 1-2 business days for this process to complete. Support will contact you when the service is available. + +## Configuring Redshift Interface-type PrivateLink + +### 1. Provision AWS Resources + +Creating an Interface VPC PrivateLink connection requires creating multiple AWS resources in the account containing the Redshift cluster: + +- **Security Group** — If you are connecting to an existing Redshift cluster, this likely already exists, however, you may need to add or modify Security Group rules to accept traffic from the Network Load Balancer (NLB) created for this Endpoint Service. +- **Target Group** — The Target Group will be attached to the NLB to tell it where to route requests. There are various target types available for NLB Target Groups, but you will use the IP address type. + + - Target Type: **IP** + + - **Standard Redshift** + + - Use IP addresses from the Redshift cluster’s **Network Interfaces** whenever possible. 
While IPs listed in the **Node IP addresses** section will work, they are also more likely to change. + + + - There will likely be only one Network Interface (NI) to start, but if the cluster fails over to another availability zone (AZ), a new NI will also be created for that AZ. The NI IP from the original AZ will still work, but the new NI IP can also be added to the Target Group. If adding additional IPs, note that the NLB will also need to add the corresponding AZ. Once created, the NI(s) should stay the same (This is our observation from testing, but AWS does not officially document it). + + - **Redshift Serverless** + + - To find the IP addresses for Redshift Serverless instance locate and copy the endpoint (only the URL listed before the port) in the Workgroup configuration section of the AWS console for the instance. + + + - From a command line run the command `nslookup ` using the endpoint found in the previous step and use the associated IP(s) for the Target Group. + + - Target Group protocol: **TCP** + +- **Network Load Balancer (NLB)** — Requires creating a Listener that attaches to the newly created Target Group for port `5439` +- **VPC Endpoint Service** — Attach to the newly created NLB. + - Acceptance required (optional) — Requires you to [accept our connection request](https://docs.aws.amazon.com/vpc/latest/privatelink/configure-endpoint-service.html#accept-reject-connection-requests) after dbt creates the endpoint. + +### 2. Grant dbt AWS Account access to the VPC Endpoint Service + +On the provisioned VPC endpoint service, click the **Allow principals** tab. Click **Allow principals** to grant access. Enter the ARN of the root user in the appropriate production AWS account and save your changes. + + - Principal: `arn:aws:iam::346425330055:role/MTPL_Admin` + + + +### 3. 
Obtain VPC Endpoint Service Name + +Once the VPC Endpoint Service is provisioned, you can find the service name in the AWS console by navigating to **VPC** → **Endpoint Services** and selecting the appropriate endpoint service. You can copy the service name field value and include it in your communication to dbt Cloud support. + + + +### 4. Add the required information to the template below, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): +``` +Subject: New Multi-Tenant PrivateLink Request +- Type: Redshift Interface-type +- VPC Endpoint Service Name: +- Redshift cluster AWS Region (e.g., us-east-1, eu-west-2): +- dbt Cloud multi-tenant environment (US, EMEA, AU): +``` + +dbt Labs will work on your behalf to complete the PrivateLink setup. Please allow 1-2 business days for this process to complete. Support will contact you when the endpoint is available. + +## Create Connection in dbt Cloud + +Once dbt Cloud support completes the configuration, you can start creating new connections using PrivateLink. + +1. Navigate to **settings** → **Create new project** → select **Redshift** +2. You will see two radio buttons: **Public** and **Private.** Select **Private**. +3. Select the private endpoint from the dropdown (this will automatically populate the hostname/account field). +4. Configure the remaining data platform details. +5. Test your connection and save it. diff --git a/website/docs/docs/cloud/secure/secure-your-tenant.md b/website/docs/docs/cloud/secure/secure-your-tenant.md new file mode 100644 index 00000000000..95cb8adffba --- /dev/null +++ b/website/docs/docs/cloud/secure/secure-your-tenant.md @@ -0,0 +1,49 @@ +--- +title: "Secure your tenant" +description: "Learn how to secure your tenant for dbt Cloud" +pagination_next: "docs/cloud/secure/ip-restrictions" +pagination_prev: null +--- + +
+ + + + + + + +
+
+
+ + + + + + +
\ No newline at end of file diff --git a/website/docs/docs/cloud/secure/snowflake-privatelink.md b/website/docs/docs/cloud/secure/snowflake-privatelink.md new file mode 100644 index 00000000000..bbbdf04ddf0 --- /dev/null +++ b/website/docs/docs/cloud/secure/snowflake-privatelink.md @@ -0,0 +1,51 @@ +--- +title: "Configuring Snowflake PrivateLink" +id: snowflake-privatelink +description: "Configuring PrivateLink for Snowflake" +sidebar_label: "PrivateLink for Snowflake" +--- + +The following steps will walk you through the setup of a Snowflake AWS PrivateLink endpoint in the dbt Cloud multi-tenant environment. + +:::note Snowflake SSO with PrivateLink +Users connecting to Snowflake using SSO over a PrivateLink connection from dbt Cloud will also require access to a PrivateLink endpoint from their local workstation. + +>Currently, for any given Snowflake account, SSO works with only one account URL at a time: either the public account URL or the URL associated with the private connectivity service. + +- [Snowflake SSO with Private Connectivity](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-overview#label-sso-private-connectivity) +::: + +## Configure PrivateLink + +1. Open a Support case with Snowflake to allow access from the dbt Cloud AWS account +- Snowflake prefers that the account owner opens the Support case directly, rather than dbt Labs acting on their behalf. For more information, refer to [Snowflake's knowledge base article](https://community.snowflake.com/s/article/HowtosetupPrivatelinktoSnowflakefromCloudServiceVendors) +- Provide them with your dbt Cloud account ID along with any other information requested in the article. + - AWS account ID: `346425330055` - _NOTE: This account ID only applies to dbt Cloud Multi-Tenant environments. 
For Virtual Private/Single-Tenant account IDs please contact [Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support)._ +- You will need to have `ACCOUNTADMIN` access to the Snowflake instance to submit a Support request. + + + +2. After Snowflake has granted the requested access, run the Snowflake system function [SYSTEM$GET_PRIVATELINK_CONFIG](https://docs.snowflake.com/en/sql-reference/functions/system_get_privatelink_config.html) and copy the output. + +3. Add the required information to the template below, and submit your request to [dbt Support](https://docs.getdbt.com/community/resources/getting-help#dbt-cloud-support): + +``` +Subject: New Multi-Tenant PrivateLink Request +- Type: Snowflake +- SYSTEM$GET_PRIVATELINK_CONFIG output: +- *Use privatelink-account-url or regionless-privatelink-account-url?: +- dbt Cloud multi-tenant environment (US, EMEA, AU): +``` +_*By default dbt Cloud will be configured to use `privatelink-account-url` from the provided [SYSTEM$GET_PRIVATELINK_CONFIG](https://docs.snowflake.com/en/sql-reference/functions/system_get_privatelink_config.html) as the PrivateLink endpoint. Upon request, `regionless-privatelink-account-url` can be used instead._ + +dbt Labs will work on your behalf to complete the PrivateLink setup. Please allow 1-2 business days for this process to complete. Support will contact you when the endpoint is available. + +## Create Connection in dbt Cloud + +Once dbt Cloud support completes the configuration, you can start creating new connections using PrivateLink. + +1. Navigate to **Settings** → **Create new project** → select **Snowflake**. +2. You will see two radio buttons: **Public** and **Private.** Select **Private**. +3. Select the private endpoint from the dropdown (this will automatically populate the hostname/account field). +4. Configure the remaining data platform details. +5. Test your connection and save it. 
\ No newline at end of file diff --git a/website/docs/docs/collaborate/cloud-build-and-view-your-docs.md b/website/docs/docs/collaborate/cloud-build-and-view-your-docs.md index d8fb539feca..a54b8fcdc2b 100644 --- a/website/docs/docs/collaborate/cloud-build-and-view-your-docs.md +++ b/website/docs/docs/collaborate/cloud-build-and-view-your-docs.md @@ -2,6 +2,7 @@ title: "Build and view your docs with dbt Cloud" id: "build-and-view-your-docs" description: "Automatically generate project documentation as you run jobs." +pagination_next: null --- dbt enables you to generate documentation for your project and data warehouse, and renders the documentation in a website. For more information, see [Documentation](/docs/collaborate/documentation). @@ -19,6 +20,15 @@ To set up a job to generate docs: 4. Click **Save**. Proceed to [configure project documentation](#configure-project-documentation) so your project generates the documentation when this job runs. +You can also add `dbt docs generate` to the list of commands in the job run steps. However, you can expect different outcomes when adding the command to the run steps compared to configuring a job selecting the **Generate docs on run** checkbox (shown in previous steps). + +Review the following options and outcomes: + +| Options | Outcomes | +|--------| ------- | +| **Select checkbox** | Select the **Generate docs on run** checkbox to automatically generate updated project docs each time your job runs. If that particular step in your job fails, the job can still be successful if all subsequent steps are successful. | +| **Add as a run step** | Add `dbt docs generate` to the list of commands in the job run steps, in whatever order you prefer. If that particular step in your job fails, the job will fail and all subsequent steps will be skipped. 
| + :::tip Tip — Documentation-only jobs To create and schedule documentation-only jobs at the end of your production jobs, add the `dbt compile` command in the **Commands** section. diff --git a/website/docs/docs/collaborate/collaborate-with-others.md b/website/docs/docs/collaborate/collaborate-with-others.md new file mode 100644 index 00000000000..7875a8044b6 --- /dev/null +++ b/website/docs/docs/collaborate/collaborate-with-others.md @@ -0,0 +1,38 @@ +--- +title: "Collaborate with others" +description: "Learn how dbt Cloud makes it easier to collaborate with others" +pagination_next: "docs/collaborate/explore-projects" +pagination_prev: null +--- + +
+ + + + + +
+
+
+ + + + + +
\ No newline at end of file diff --git a/website/docs/docs/collaborate/documentation.md b/website/docs/docs/collaborate/documentation.md index dc9a3a6c848..0fa00c7cca2 100644 --- a/website/docs/docs/collaborate/documentation.md +++ b/website/docs/docs/collaborate/documentation.md @@ -1,14 +1,17 @@ --- title: "About documentation" +description: "Learn how good documentation for your dbt models helps stakeholders discover and understand your datasets." id: "documentation" +pagination_next: "docs/collaborate/build-and-view-your-docs" +pagination_prev: null --- ## Related documentation -* [Declaring properties](configs-and-properties) -* [`dbt docs` command](cmd-docs) +* [Declaring properties](/reference/configs-and-properties) +* [`dbt docs` command](/reference/commands/cmd-docs) * [`doc` Jinja function](/reference/dbt-jinja-functions) -* If you're new to dbt, we recommend that you check out our [Getting Started Tutorial](/docs/get-started/getting-started/overview) to build your first dbt project, complete with documentation. +* If you're new to dbt, we recommend that you check out our [quickstart guide](/quickstarts) to build your first dbt project, complete with documentation. ## Assumed knowledge @@ -66,17 +69,17 @@ First, run `dbt docs generate` — this command tells dbt to compile relevant in Then, run `dbt docs serve` to use these `.json` files to populate a local website. ## FAQs - - - - - - + + + + + + ## Using Docs Blocks ### Syntax -To declare a docs block, use the jinja `docs` tag. Docs blocks must be uniquely named, and can contain arbitrary markdown. In practice, a docs block might look like this: +To declare a docs block, use the jinja `docs` tag. Docs blocks can contain arbitrary markdown, but they must be uniquely named. Their names may contain uppercase and lowercase letters (A-Z, a-z), digits (0-9), and underscores (_), but can't start with a digit. 
@@ -96,14 +99,14 @@ The events in this table are recorded by [Snowplow](http://github.com/snowplow/s -In the above example, a docs block named `table_events` is defined with some descriptive markdown contents. There is nothing significant about the name `table_events` — docs blocks can be named however you like, as long as the name only contains alphanumeric and underscore characters. +In the above example, a docs block named `table_events` is defined with some descriptive markdown contents. There is nothing significant about the name `table_events` — docs blocks can be named however you like, as long as the name only contains alphanumeric and underscore characters and does not start with a numeric character. ### Placement -Docs blocks should be placed in files with a `.md` file extension. By default, dbt will search in all resource paths for docs blocks (i.e. the combined list of [model-paths](model-paths), [seed-paths](seed-paths), [analysis-paths](analysis-paths), [macro-paths](macro-paths) and [snapshot-paths](snapshot-paths)) — you can adjust this behavior using the [docs-paths](docs-paths) config. +Docs blocks should be placed in files with a `.md` file extension. By default, dbt will search in all resource paths for docs blocks (i.e. the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [macro-paths](/reference/project-configs/macro-paths) and [snapshot-paths](/reference/project-configs/snapshot-paths)) — you can adjust this behavior using the [docs-paths](/reference/project-configs/docs-paths) config. ### Usage -To use a docs block, reference it from your `schema.yml` file with the [doc()](doc) function in place of a markdown string. 
Using the examples above, the `table_events` docs can be included in the `schema.yml` file as shown below: +To use a docs block, reference it from your `schema.yml` file with the [doc()](/reference/dbt-jinja-functions/doc) function in place of a markdown string. Using the examples above, the `table_events` docs can be included in the `schema.yml` file as shown below: @@ -146,7 +149,6 @@ as well as the repo for this project \[here](https://github.com/dbt-labs/mrr-pla ### Custom project-level overviews -New in v0.18.0 You can set different overviews for each dbt project/package included in your documentation site by creating a docs block named `__[project_name]__`. For example, in order to define @@ -183,7 +185,7 @@ From a docs page, you can click the green button in the bottom-right corner of t -In this example, the `fct_subscription_transactions` model only has one direct parent. By clicking the "Expand" button in the top-right corner of the window, we can pivot the graph horizontally and view the full lineage for our model. This lineage is filterable using the `--select` and `--exclude` flags, which are consistent with the semantics of [model selection syntax](node-selection/syntax). Further, you can right-click to interact with the DAG, jump to documentation, or share links to your graph visualization with your coworkers. +In this example, the `fct_subscription_transactions` model only has one direct parent. By clicking the "Expand" button in the top-right corner of the window, we can pivot the graph horizontally and view the full lineage for our model. This lineage is filterable using the `--select` and `--exclude` flags, which are consistent with the semantics of [model selection syntax](/reference/node-selection/syntax). Further, you can right-click to interact with the DAG, jump to documentation, or share links to your graph visualization with your coworkers. 
diff --git a/website/docs/docs/collaborate/environments.md b/website/docs/docs/collaborate/environments.md deleted file mode 100644 index c611056c9e1..00000000000 --- a/website/docs/docs/collaborate/environments.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: "Environments" -id: "environments" ---- - -## What are environments? -In software engineering, environments are used to enable engineers to develop and test code without impacting the users of their software. - -“Production” (or _prod_) refers to the environment that end users interact with, while “development” (or _dev_) is the environment that engineers write code in. This means that engineers can work iteratively when writing and testing new code in _development_, and once they are confident in these changes, deploy their code to _production_. - -In traditional software engineering, different environments often use completely separate architecture. For example, the dev and prod versions of a website may use different servers and databases. - -Data warehouses can also be designed to have separate environments – the _production_ environment refers to the relations (i.e. schemas, tables, and views) that your end users query (often through a BI tool). - -## How do I maintain different environments with dbt? -dbt makes it easy to maintain separate production and development environments through the use of targets within a profile. A typical profile when using dbt locally (i.e. running from your command line) will have a target named `dev`, and have this set as the default. This means that while making changes, your objects will be built in your _development_ target, without affecting production queries made by your end users. Once you are confident in your changes, you can deploy the code to _production_, by running your dbt project with a _prod_ target. - -:::info Running dbt in production - -You can learn more about different ways to run dbt in production in [this article](/docs/deploy/deployments). 
- -::: - -Targets offer the flexibility to decide how to implement your separate environments – whether you want to use separate schemas, databases, or entirely different clusters altogether! We recommend using _different schemas within one data warehouse_ to separate your environments. This is the easiest to set up, and is the most cost effective solution in a modern cloud-based data stack. - -In practice, this means that most of the details in a target will be consistent across all targets, except for the `schema` and user credentials. If you have multiple dbt users writing code, it often makes sense for _each user_ to have their own _development_ environment. A pattern we've found useful is to set your dev target schema to be `dbt_`. User credentials should also differ across targets so that each dbt user is using their own data warehouse user. - -## Related docs -- [About dbt Core versions](/docs/dbt-versions/core) -- [Upgrade Core version in Cloud](/docs/dbt-versions/upgrade-core-in-cloud) diff --git a/website/docs/docs/collaborate/explore-projects.md b/website/docs/docs/collaborate/explore-projects.md new file mode 100644 index 00000000000..b041cd0c915 --- /dev/null +++ b/website/docs/docs/collaborate/explore-projects.md @@ -0,0 +1,236 @@ +--- +title: "Explore your dbt projects" +sidebar_label: "Explore dbt projects" +description: "Learn about dbt Explorer and how to interact with it to understand, improve, and leverage your data pipelines." +pagination_next: null +pagination_prev: null +--- + +With dbt Explorer, you can view your project's [resources](/docs/build/projects) (such as models, tests, and metrics) and their lineage to gain a better understanding of its latest production state. Navigate and manage your projects within dbt Cloud to help you and other data developers, analysts, and consumers discover and leverage your dbt resources. + +:::tip Public preview + +Try dbt Explorer! 
It's available in [Public Preview](/docs/dbt-versions/product-lifecycles#dbt-cloud) as of October 17, 2023 for dbt Cloud customers. More updates coming soon. + +::: + +## Prerequisites + +- You have a [multi-tenant](/docs/cloud/about-cloud/tenancy#multi-tenant) or AWS single-tenant dbt Cloud account on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). +- You have set up a [production deployment environment](/docs/deploy/deploy-environments#set-as-production-environment-beta) for each project you want to explore. + - There has been at least one successful job run in the production deployment environment. +- You are on the dbt Explorer page. To do this, select **Explore** from the top navigation bar in dbt Cloud. + + +## Generate metadata + +dbt Explorer uses the metadata provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-api) to display the details about [the state of your project](/docs/dbt-cloud-apis/project-state). The metadata that's available depends on the [deployment environment](/docs/deploy/deploy-environments) you've designated as _production_ in your dbt Cloud project. dbt Explorer automatically retrieves the metadata updates after each job run in the production deployment environment so it always has the latest results for your project. + +To view a resource and its metadata, you must define the resource in your project and run a job in the production environment. The resulting metadata depends on the [commands executed by the jobs](/docs/deploy/job-commands). + +For a richer experience with dbt Explorer, you must: + +- Run [dbt run](/reference/commands/run) or [dbt build](/reference/commands/build) on a given model within a job in the environment to update model details or results. +- Run [dbt docs generate](/reference/commands/cmd-docs) within a job in the environment to view catalog statistics and columns for models, sources, and snapshots. 
+- Run [dbt test](/reference/commands/test) or [dbt build](/reference/commands/build) within a job in the environment to view test results. +- Run [dbt source freshness](/reference/commands/source#dbt-source-freshness) within a job in the environment to view source freshness data. +- Run [dbt snapshot](/reference/commands/snapshot) or [dbt build](/reference/commands/build) within a job in the environment to view snapshot details. + +Richer and more timely metadata will become available as dbt, the Discovery API, and the underlying dbt Cloud platform evolve. + +## Explore your project's lineage graph {#project-lineage} + +dbt Explorer provides a visualization of your project’s DAG that you can interact with. To access the project's full lineage graph, select **Overview** in the left sidebar and click the **Explore Lineage** button on the main (center) section of the page. + +If you don't see the project lineage graph immediately, click **Render Lineage**. It can take some time for the graph to render depending on the size of your project and your computer’s available memory. The graph of very large projects might not render, so you can select a subset of nodes by using selectors instead. + +The nodes in the lineage graph represent the project’s resources and the edges represent the relationships between the nodes. Nodes are color-coded and include iconography according to their resource type. + +To explore the lineage graphs of tests and macros, view [their resource details pages](#view-resource-details). By default, dbt Explorer excludes these resources from the full lineage graph unless a search query returns them as results. + +To interact with the full lineage graph, you can: + +- Hover over any item in the graph to display the resource’s name and type. +- Zoom in and out on the graph by mouse-scrolling. +- Grab and move the graph and the nodes. +- Select a resource to highlight its relationship with other resources in your project.
A panel opens on the graph’s right-hand side that displays a high-level summary of the resource’s details. The side panel includes a **General** tab for information like description, materialized type, and other details. + - Click the Share icon in the side panel to copy the graph’s link to your clipboard. + - Click the View Resource icon in the side panel to [view the resource details](#view-resource-details). +- [Search and select specific resources](#search-resources) or a subset of the DAG using selectors and graph operators. For example: + - `+[RESOURCE_NAME]` — Displays all parent nodes of the resource + - `resource_type:model [RESOURCE_NAME]` — Displays all models matching the name search + +- [View resource details](#view-resource-details) by selecting a node (double-clicking) in the graph. +- Click the List view icon in the graph's upper right corner to return to the main **Explore** page. + + + + +## Search for resources {#search-resources} +With the search bar (on the upper left corner of the page or in a lineage graph), you can search with keywords or by using [node selection syntax](/reference/node-selection/syntax). The resources that match your search criteria will display as a lineage graph and a table in the main section of the page. + +Select a node (single-click) in the lineage graph to highlight its relationship with your other search results and to display which project contains the resource's definition. When you choose a node (double-click) in the lineage graph or when you select a resource in the table, dbt Explorer displays the [resource's details page](#view-resource-details). + +### Search with keywords +When searching with keywords, dbt Explorer searches through your resource metadata (such as resource type, resource name, column name, source name, tags, schema, database, version, alias/identifier, and package name) and returns any matches. 
+ +### Search with selector methods + +You can search with [selector methods](/reference/node-selection/methods). Below are the selectors currently available in dbt Explorer: + +- `fqn:` — Find resources by [file or fully qualified name](/reference/node-selection/methods#the-fqn-method). This selector is the search bar's default. If you want to use the default, it's unnecessary to add `fqn:` before the search term. +- `source:` — Find resources by a specified [source](/reference/node-selection/methods#the-source-method). +- `resource_type:` — Find resources by their [type](/reference/node-selection/methods#the-resource_type-method). +- `package:` — Find resources by the [dbt package](/reference/node-selection/methods#the-package-method) that defines them. +- `tag:` — Find resources by a specified [tag](/reference/node-selection/methods#the-tag-method). + + + +- `group:` — Find models defined within a specified [group](/reference/node-selection/methods#the-group-method). +- `access:` — Find models based on their [access](/reference/node-selection/methods#the-access-method) property. + + + +### Search with graph operators + +You can use [graph operators](/reference/node-selection/graph-operators) on keywords or selector methods. For example, `+orders` returns all the parents of `orders`. + +### Search with set operators + +You can use multiple selector methods in your search query with [set operators](/reference/node-selection/set-operators). A space implies a union set operator and a comma for an intersection. For example: +- `resource_type:metric,tag:nightly` — Returns metrics with the tag `nightly` +- `+snowplow_sessions +fct_orders` — Returns resources that are parent nodes of either `snowplow_sessions` or `fct_orders` + +### Search with both keywords and selector methods + +You can use keyword search to highlight results that are filtered by the selector search. 
For example, if you don't have a resource called `customers`, then `resource_type:metric customers` returns all the metrics in your project and highlights those that are related to the term `customers` in the name, in a column, tagged as customers, and so on. + +When searching in this way, the selectors behave as filters that you can use to narrow the search and keywords as a way to find matches within those filtered results. + + + +## Browse with the sidebar + +By default, the catalog sidebar lists all your project’s resources. Select any resource type in the list and all those resources in the project will display as a table in the main section of the page. For a description on the different resource types (like models, metrics, and so on), refer to [About dbt projects](/docs/build/projects). + +To browse using a different view, you can choose one of these options from the **View by** dropdown: + +- **Resources** (default) — All resources in the project organized by type. +- **Packages** — All resources in the project organized by the dbt package in which they are defined. +- **File Tree** — All resources in the project organized by the file in which they are defined. This mirrors the file tree in your dbt project repository. +- **Database** — All resources in the project organized by the database and schema in which they are built. This mirrors your data platform's structure that represents the [applied state](/docs/dbt-cloud-apis/project-state) of your project. + + + +## View model versions + +If models in the project are versioned, you can see which [version of the model](/docs/collaborate/govern/model-versions) is being applied — `prerelease`, `latest`, and `old` — in the title of the model’s details page and in the model list from the sidebar. + +## View resource details {#view-resource-details} +You can view the definition and latest run results of any resource in your project. 
To find a resource and view its details, you can interact with the lineage graph, use search, or browse the catalog. + +The details (metadata) available to you depend on the resource’s type, its definition, and the [commands](/docs/deploy/job-commands) that run within jobs in the production environment. + + + + +### Example of model details + +An example of the details you might get for a model: + +- Status bar (below the page title) — Information on the last time the model ran, whether the run was successful, how the data is materialized, number of rows, and the size of the model. +- **General** tab includes: + - **Lineage** graph — The model’s lineage graph that you can interact with. The graph includes one parent node and one child node from the model. Click the Expand icon in the graph's upper right corner to view the model in full lineage graph mode. + - **Description** section — A [description of the model](/docs/collaborate/documentation#adding-descriptions-to-your-project). + - **Recent** section — Information on the last time the model ran, how long it ran for, whether the run was successful, the job ID, and the run ID. + - **Tests** section — [Tests](/docs/build/tests) for the model. + - **Details** section — Key properties like the model’s relation name (for example, how it’s represented and how you can query it in the data platform: `database.schema.identifier`); model governance attributes like access, group, and if contracted; and more. + - **Relationships** section — The nodes the model **Depends On**, is **Referenced by**, and (if applicable) is **Used by** for projects that have declared the model's project as a dependency. +- **Code** tab — The source code and compiled code for the model. +- **Columns** tab — The available columns in the model. This tab also shows test results (if any) that you can select to view the test's details page. A :white_check_mark: denotes a passing test.
+ + +### Example of exposure details + +An example of the details you might get for an exposure: + +- Status bar (below the page title) — Information on the last time the exposure was updated. +- **General** tab includes: + - **Status** section — The status on data freshness and data quality. + - **Lineage** graph — The exposure’s lineage graph. Click the Expand icon in the graph's upper right corner to view the exposure in full lineage graph mode. + - **Description** section — A description of the exposure. + - **Details** section — Details like exposure type, maturity, owner information, and more. + - **Relationships** section — The nodes the exposure **Depends On**. + +### Example of test details + +An example of the details you might get for a test: + +- Status bar (below the page title) — Information on the last time the test ran, whether the test passed, test name, test target, and column name. +- **General** tab includes: + - **Lineage** graph — The test’s lineage graph that you can interact with. The graph includes one parent node and one child node from the test resource. Click the Expand icon in the graph's upper right corner to view the test in full lineage graph mode. + - **Description** section — A description of the test. + - **Recent** section — Information on the last time the test ran, how long it ran for, whether the test passed, the job ID, and the run ID. + - **Details** section — Details like schema, severity, package, and more. + - **Relationships** section — The nodes the test **Depends On**. +- **Code** tab — The source code and compiled code for the test. + + +### Example of source details + +An example of the details you might get for each source table within a source collection: + +- Status bar (below the page title) — Information on the last time the source was updated and the number of tables the source uses. +- **General** tab includes: + - **Lineage** graph — The source’s lineage graph that you can interact with. 
The graph includes one parent node and one child node from the source. Click the Expand icon in the graph's upper right corner to view the source in full lineage graph mode. + - **Description** section — A description of the source. + - **Source freshness** section — Information on whether refreshing the data was successful, the last time the source was loaded, the timestamp of when a run generated data, and the run ID. + - **Details** section — Details like database, schema, and more. + - **Relationships** section — A table that lists all the sources used with their freshness status, the timestamp of when freshness was last checked, and the timestamp of when the source was last loaded. +- **Columns** tab — The available columns in the source. This tab also shows test results (if any) that you can select to view the test's details page. A :white_check_mark: denotes a passing test. + +## About project-level lineage +You can also view all the different projects and public models in the account, where the public models are defined, and how they are used to gain a better understanding of your cross-project resources. + +When viewing the resource-level lineage graph for a given project that uses cross-project references, you can see cross-project relationships represented in the DAG. The iconography is slightly different depending on whether you're viewing the lineage of an upstream producer project or a downstream consumer project. + +When viewing an upstream (parent) project that produces public models that are imported by downstream (child) projects, public models will have a counter icon in their upper right corner that indicates the number of projects that declare the current project as a dependency. Selecting that model reveals the lineage to show the specific projects that are dependent on this model.
Projects show up in this counter if they declare the parent project as a dependency in their `dependencies.yml` regardless of whether or not there's a direct `{{ ref() }}` against the public model. Selecting a project node from a public model opens the resource-level lineage graph for that project, which is subject to your permissions. + + + +When viewing a downstream (child) project that imports and refs public models from upstream (parent) projects, public models will show up in the lineage graph and display an icon on the graph edge that indicates what the relationship is to a model from another project. Hovering over this icon indicates the specific dbt Cloud project that produces that model. Double-clicking on a model from another project opens the resource-level lineage graph of the parent project, which is subject to your permissions. + + + + +### Explore the project-level lineage graph + +For cross-project collaboration, you can interact with the DAG in all the same ways as described in [Explore your project's lineage](#project-lineage), but you can also interact with it at the project level and view the details. + +To get a list view of all the projects, select the account name at the top of the **Explore** page near the navigation bar. This view includes a public model list, project list, and a search bar for project searches. You can also view the project-level lineage graph by clicking the Lineage view icon in the page's upper right corner. + +If you have permissions for a project in the account, you can view all public models used across the entire account. However, you can only view full public model details and private models if you have permissions for a project where the models are defined. + +From the project-level lineage graph, you can: + +- Click the Lineage view icon (in the graph’s upper right corner) to view the cross-project lineage graph. +- Click the List view icon (in the graph’s upper right corner) to view the project list.
+ - Select a project from the **Projects** tab to switch to that project’s main **Explore** page. + - Select a model from the **Public Models** tab to view the [model’s details page](#view-resource-details). + - Perform searches on your projects with the search bar. +- Select a project node in the graph (double-clicking) to switch to that particular project’s lineage graph. + +When you select a project node in the graph, a project details panel opens on the graph’s right-hand side where you can: + +- View counts of the resources defined in the project. +- View a list of its public models, if any. +- View a list of other projects that use the project, if any. +- Click **Open Project Lineage** to switch to the project’s lineage graph. +- Click the Share icon to copy the project panel link to your clipboard so you can share the graph with someone. + + + +## Related content +- [Enterprise permissions](/docs/cloud/manage-access/enterprise-permissions) +- [About model governance](/docs/collaborate/govern/about-model-governance) +- [What is data mesh?](https://www.getdbt.com/blog/what-is-data-mesh-the-definition-and-importance-of-data-mesh) blog diff --git a/website/docs/docs/collaborate/git-version-control.md b/website/docs/docs/collaborate/git-version-control.md index 7006b8836f2..392e2c3baa5 100644 --- a/website/docs/docs/collaborate/git-version-control.md +++ b/website/docs/docs/collaborate/git-version-control.md @@ -3,6 +3,8 @@ title: "About git" id: git-version-control description: "Git overview" sidebar_label: "About git" +pagination_next: "docs/collaborate/git/version-control-basics" +pagination_prev: null --- A [version control](https://en.wikipedia.org/wiki/Version_control) system allows you and your teammates to work collaboratively, safely, and simultaneously on a single project. Version control helps you track all the code changes made in your dbt project.
@@ -19,6 +21,7 @@ When you develop in the command line interface (CLI) or Cloud integrated develo ## Related docs - [Version control basics](/docs/collaborate/git/version-control-basics) -- [Resolve merge conflicts](/docs/collaborate/git/resolve-merge-conflicts) -- [Connect to GitHub](/docs/collaborate/git/connect-github) -- [Connect to GitLab](/docs/collaborate/git/connect-gitlab) +- [Merge conflicts](/docs/collaborate/git/merge-conflicts) +- [Connect to GitHub](/docs/cloud/git/connect-github) +- [Connect to GitLab](/docs/cloud/git/connect-gitlab) +- [Connect to Azure DevOps](/docs/cloud/git/connect-azure-devops) diff --git a/website/docs/docs/collaborate/git/connect-github.md b/website/docs/docs/collaborate/git/connect-github.md deleted file mode 100644 index 74b416908d0..00000000000 --- a/website/docs/docs/collaborate/git/connect-github.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -title: "Connecting your GitHub account" -id: "connect-github" -sidebar_label: "Connecting GitHub" ---- - -## Overview - -Connecting your GitHub account to dbt Cloud provides convenience and another layer of security to dbt Cloud: -- Log into dbt Cloud using OAuth through GitHub. -- Import new GitHub repositories with a couple clicks during dbt Cloud project setup. -- Clone repos using HTTPS rather than SSH. -- Trigger [Continuous integration](/docs/deploy/cloud-ci-job) builds when pull requests are opened in GitHub. - - -To connect GitHub in dbt Cloud: -1. A GitHub organization owner must first [install the dbt Cloud application](/docs/collaborate/git/connect-github#installing-dbt-cloud-in-your-github-account) in your team's GitHub account. -2. All other dbt Cloud developers on the account need to [personally authenticate with GitHub](/docs/collaborate/git/connect-github#personally-authenticate-with-github) from dbt Cloud. 
- -If you are the GitHub organization owner tasked with the installation of the dbt Cloud app in step 1, you will also be automatically personally authenticated after completion, so step 2 will be taken care of. This means teams of one only need to complete step 1. - -:::info Use GitHub On-Premise? -If you're using an On-Premises GitHub deployment, this method will not work for your account. Please instead reference our docs on [importing a project by git URL](/docs/collaborate/git/import-a-project-by-git-url) to set up your connection. This alternative connection method does not include the benefits of the native integration mentioned above. -::: - -## Installing dbt Cloud in your GitHub account - -A GitHub organization owner needs to connect and configure the dbt Cloud app for their GitHub organization. If you are a team of one or wish to connect your personal GitHub account, then these instructions also apply to you. - -To link your dbt Cloud account to your team's GitHub account, navigate to Your Profile settings by clicking the gear icon in the top right. Select **Linked Accounts** from the left menu. - - - -In the Linked Accounts section, you can set up your GitHub account connection to dbt Cloud by clicking **Link** to the right of GitHub. This redirects you to your account on GitHub where you will be asked to install and configure the dbt Cloud application. Select the GitHub organization and repositories dbt Cloud should access. - - - -The dbt Cloud GitHub App requires the following permissions: -- Read access to metadata -- Read and write access to checks, code, commit statuses, pull requests, and workflows - -Once you grant access to the app, you will be redirected back to dbt Cloud and shown a linked account success state. You are now personally authenticated too, and your team members can begin [connecting their profiles](/docs/collaborate/git/connect-github#personally-authenticate-with-github). 
- -## Configuring the dbt Cloud app in your GitHub account -If you are your GitHub organization owner, you can also configure the dbt Cloud GitHub application to have access to only select repositories. This configuration must be done in GitHub, but we provide an easy link in dbt Cloud to start this process. - - -## Personally authenticate with GitHub -dbt Cloud developers on the Enterprise plan must each connect their GitHub profiles to dbt Cloud, as every developer's read / write access for the dbt repo is checked in the dbt Cloud IDE. dbt Cloud developers on the Team plan do not need to each connect their profiles to GitHub, but it is still recommended to do so. - -To connect a personal GitHub account, dbt Cloud developers should navigate to Your Profile settings by clicking the gear icon in the top right, then select **Linked Accounts** in the left menu. - -If your GitHub account is not connected, you’ll see "No connected account". Select **Link** to begin the setup process. You’ll be redirected to GitHub, and asked to authorize dbt Cloud in a grant screen. - - -Once you approve authorization, you will be redirected to dbt Cloud, and you should now see your connected account. The next time you log into dbt Cloud, you will be able to do so via OAuth through GitHub, and if you're on the Enterprise plan, you're ready to use the dbt Cloud IDE. diff --git a/website/docs/docs/collaborate/git/import-a-project-by-git-url.md b/website/docs/docs/collaborate/git/import-a-project-by-git-url.md deleted file mode 100644 index 5d09e336ccd..00000000000 --- a/website/docs/docs/collaborate/git/import-a-project-by-git-url.md +++ /dev/null @@ -1,102 +0,0 @@ ---- -title: "Import a project by git URL" -id: "import-a-project-by-git-url" ---- - -In dbt Cloud, you can import a git repository from any valid git URL that points to a dbt project. There are a couple of important considerations to keep in mind when doing this: - -## Git protocols -You must use the `git@...` or `ssh:..`. 
version of your git URL, not the `https://...` version. dbt Cloud uses the SSH protocol to clone repositories, so dbt Cloud will be unable to clone repos supplied with the HTTP protocol. - -## Managing Deploy Keys -After importing a project by Git URL, dbt Cloud will generate a Deploy Key for your repository. You must provide this Deploy Key in the Repository configuration of your Git host. This Deploy Key should be be configured to allow *read and write access* to the specified repositories. - -**Note**: Each dbt Cloud project will generate a different deploy key when connected to a repo, even if two projects are connected to the same repo. Both deploy keys will need to be supplied to your git provider. - -### GitHub - -:::info Use GitHub? - -If you use GitHub, you can import your repo directly using [dbt Cloud's GitHub Application](/docs/collaborate/git/connect-github). Connecting your repo via the GitHub Application [enables Continuous Integration](/docs/deploy/cloud-ci-job). - -::: - -To add a deploy key to a GitHub account, navigate to the Deploy keys tab of the settings page in your GitHub repository. After supplying a name for the deploy key and pasting in your deploy key (generated by dbt Cloud), be sure to check the "Allow write access" checkbox. After adding this key, dbt Cloud will be able to read and write files in your dbt project. - -See also: [Adding a deploy key in GitHub](https://github.blog/2015-06-16-read-only-deploy-keys/) - - - -### GitLab - -:::info Use GitLab? - -If you use GitLab, you can import your repo directly using [dbt Cloud's GitLab Application](/docs/collaborate/git/connect-gitlab). Connecting your repo via the GitLab Application [enables Continuous Integration](/docs/deploy/cloud-ci-job). - -::: - -To add a deploy key to a GitLab account, navigate to the [SSH keys](https://gitlab.com/profile/keys) tab in the User Settings page of your GitLab account. Next, paste in the deploy key generated by dbt Cloud for your repository. 
After saving this SSH key, dbt Cloud will be able to read and write files in your GitLab repository. - -See also: [Adding a read only deploy key in GitLab](https://docs.gitlab.com/ee/ssh/#per-repository-deploy-keys) - - - -### BitBucket - -To add a deploy key to a BitBucket account, navigate to "SSH keys" tab in the Personal Settings page of your BitBucket account. Next, click the "Add key" button and paste in the deploy key generated by dbt Cloud for your repository. After saving this SSH key, dbt Cloud will be able to read and write files in your BitBucket repository. - - - -### AWS CodeCommit - -dbt Cloud can work with dbt projects hosted on AWS CodeCommit, but there are some extra steps needed compared to Github or other git providers. This guide will help you connect your CodeCommit-hosted dbt project to dbt Cloud. - -#### Step 1: Create an AWS User for dbt Cloud - -To give dbt Cloud access to your repository, first you'll need to create an AWS IAM user for dbt Cloud. Log into the AWS Console and navigate to the IAM section. Click "Add User", and create a new user with "Programmatic Access". - -This user will need clone access to your repository. The easiest way to set that up is to add the "AWSCodeCommitPowerUser" permission during setup. - -#### Step 2: Import your repository by name - -Open the AWS CodeCommit console and choose your repository. Copy the SSH URL from that page. Next, navigate to the "New Repository" page in dbt Cloud. Choose the "Git URL" tab, and paste in the SSH URL you copied from the console. - -In the newly created Repository details page, you'll see a "Deploy Key" field. Copy the contents of this field as you'll need it for step 3. - -**Note:** The dbt Cloud-generated public key is the only key that will work in the next step. Any other key that has been generated outside of dbt Cloud will not work. - -#### Step 3: Grant dbt Cloud AWS User access - -Open up the newly created dbt Cloud user in the AWS IAM Console. 
Choose the "Security Credentials" tab and then click "Upload SSH public key". Paste in the contents of the "Public Key" field from the dbt Cloud Repository page. - -Once you've created the key, you'll see an "SSH key ID" for it. You'll need to write into support and share that field so that dbt Cloud support can complete the setup process for you. - -You're all set! Once support handles your request, your project is set up and you can begin executing dbt runs from dbt Cloud. - -### Azure DevOps -:::info Use Azure DevOps? - -If you use Azure DevOps and you are on the dbt Cloud Enterprise plan, you can import your repo directly using [dbt Cloud's Azure DevOps Integration](/docs/collaborate/git/connect-azure-devops). Connecting your repo via the Azure DevOps Application [enables Continuous Integration](/docs/deploy/cloud-ci-job). - -::: - -To add a deploy key to an Azure DevOps account, navigate to the "SSH public keys" page in the User Settings of your user's Azure DevOps account or a service user's account. We recommend using a dedicated service user for the integration to ensure that dbt Cloud's connection to Azure DevOps is not interrupted by changes to user permissions. - - - -Next, click the "+ New Key" button to create a new SSH key for the repository. - - - -Pick a descriptive name for the key and then paste in the deploy key generated by dbt Cloud for your repository. After saving this SSH key, dbt Cloud will be able to read and write files in your Azure DevOps repository. - - - -### Other git providers - -Don't see your git provider here? Please contact support - we're happy to help you set up dbt Cloud with any supported git provider. - -## Limited integration -Some features of dbt Cloud require a tight integration with your git host, e.g. updating Github pull requests with dbt Cloud run statuses. Importing your project by a URL prevents you from using these features. 
- -Once you give dbt Cloud access to your repository, you can continue to set up your project by adding a connection and creating and running your first dbt Cloud job. diff --git a/website/docs/docs/collaborate/git/managed-repository.md b/website/docs/docs/collaborate/git/managed-repository.md index d7beb38c4f5..db8e9840ccd 100644 --- a/website/docs/docs/collaborate/git/managed-repository.md +++ b/website/docs/docs/collaborate/git/managed-repository.md @@ -15,6 +15,6 @@ To set up a project with a managed repository: 6. Click **Create**. -dbt Cloud will host and manage this repository for you. If in the future you choose to host this repository yourself, you can contact support to have the contents of your repo transferred to you. +dbt Cloud will host and manage this repository for you. If in the future you choose to host this repository elsewhere, you can export the information from dbt Cloud at any time. ** We do not recommend using a managed repository in a production environment. You will not be able to use git features like pull requests which are part of our recommended version control best practices. diff --git a/website/docs/docs/collaborate/git/merge-conflicts.md b/website/docs/docs/collaborate/git/merge-conflicts.md new file mode 100644 index 00000000000..c3c19b1e2a1 --- /dev/null +++ b/website/docs/docs/collaborate/git/merge-conflicts.md @@ -0,0 +1,72 @@ +--- +title: "Merge conflicts" +id: "merge-conflicts" +pagination_next: null +--- + +[Merge conflicts](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/addressing-merge-conflicts/about-merge-conflicts) in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) often occur when multiple users are simultaneously making edits to the same section in the same file. This makes it difficult for Git to decide what changes to incorporate in the final merge. + +The merge conflict process provides users the ability to choose which lines of code they'd like to preserve and commit. 
This document will show you how to resolve merge conflicts in the dbt Cloud IDE. + +## Identify merge conflicts + +You can experience a merge conflict in two possible ways: + +- Pulling changes from your main branch when someone else has merged a conflicting change. +- Committing your changes to the same branch when someone else has already committed their change first. + +The way to [resolve](#resolve-merge-conflicts) either scenario will be exactly the same. + +For example, if you and a teammate make changes to the same file and commit, you will encounter a merge conflict as soon as you **Commit and sync**. + +The dbt Cloud IDE will display: + +- **Commit and resolve** git action bar under **Version Control** instead of **Commit** — This indicates that the Cloud IDE has detected some conflicts that you need to address. +- A 2-split editor view — The left view includes your code changes and is read-only. The right view includes the additional changes, allows you to edit and marks the conflict with some flags: + +``` +<<<<<< HEAD + your current code +====== + conflicting code +>>>>>> (some branch identifier) +``` +- The file and path colored in red in the **File Explorer**, with a warning icon to highlight files that you need to resolve. +- The file name colored in red in the **Changes** section, with a warning icon. +- If you press commit without resolving the conflict, the dbt Cloud IDE will prompt a pop up box with a list which files need to be resolved. + + + + + + +## Resolve merge conflicts +You can seamlessly resolve merge conflicts that involve competing line changes in the Cloud IDE. + +1. In the dbt Cloud IDE, you can edit the right-side of the conflict file, choose which lines of code you'd like to preserve, and delete the rest. + * Note: The left view editor is read-only and you cannot make changes. +3. 
Delete the special flags or conflict markers `<<<<<<<`, `=======`, `>>>>>>>` that highlight the merge conflict and also choose which lines of code to preserve. +4. If you have more than one merge conflict in your file, scroll down to the next set of conflict markers and repeat steps one and two to resolve your merge conflict. +5. Press **Save**. You will notice the line highlights disappear and return to a plain background. This means that you've resolved the conflict successfully. +6. Repeat this process for every file that has a merge conflict. + + + + +:::info Edit conflict files +- If you open the conflict file under **Changes**, the file name will display something like `model.sql (last commit)` and is fully read-only and cannot be edited.
+- If you open the conflict file under **File Explorer**, you can edit the file in the right view. +::: + +## Commit changes + +When you've resolved all the merge conflicts, the last step would be to commit the changes you've made. + +1. Click the git action bar **Commit and resolve**. +2. The **Commit Changes** pop up box will confirm that all conflicts have been resolved. Write your commit message and press **Commit Changes**. +3. The dbt Cloud IDE will return to its normal state and you can continue developing! + + + + + diff --git a/website/docs/docs/collaborate/git/pr-template.md b/website/docs/docs/collaborate/git/pr-template.md index b6a9493ee27..ddb4948dad9 100644 --- a/website/docs/docs/collaborate/git/pr-template.md +++ b/website/docs/docs/collaborate/git/pr-template.md @@ -2,17 +2,22 @@ title: "PR template" id: "pr-template" --- - ## Configure pull request (PR) template URLs When changes are committed on a branch in the IDE, dbt Cloud can prompt users to open a new Pull Request for the code changes. To enable this functionality, ensure that a PR Template URL is configured in the Repository details page in your -Account Settings. **Note:** If this template URL is unset, then the IDE will -instead show a prompt to merge the changes directly into the `master` branch. +Account Settings. If this setting is blank, the IDE will prompt users to merge the changes directly into their default branch. +### PR Template URL by git provider + +The PR Template URL setting will be automatically set for most repositories, depending on the connection method. + +- If you connect to your repository via in-app integrations with your git provider or the "Git Clone" method via SSH, this URL setting will be auto-populated and editable. + - If you connect via a dbt Cloud [Managed repository](/docs/collaborate/git/managed-repository), this URL will not be set, and the IDE will prompt users to merge the changes directly into their default branch. 
+ The PR template URL supports two variables which can be used to build a URL string. These variables, `{{source}}` and `{{destination}}` return branch names based on the state of the configured Environment and active branch open in the IDE. The `{{source}}` @@ -44,14 +49,6 @@ https://github.com/dbt-labs/jaffle_shop/compare/master..my-branch
-## Configure custom branches - -By default in Development Environments, dbt Cloud attempts to reference the `main` branch in connected repositories. If you want to use a different default branch name, you can configure dbt Cloud with a custom branch setting. - -For example, you can use the `develop` branch of a connected repository. Edit an environment, then in "General settings" select **Only run on a custom branch** , and in "Custom branch" type **develop** or the name of your custom branch. - - - ## Example templates Some common URL templates are provided below, but please note that the exact @@ -75,7 +72,7 @@ https://gitlab.com///-/merge_requests/new?merge_request[source_branch ### BitBucket ``` -https://bitbucket.org///pull-requests/new?source={{source}} +https://bitbucket.org///pull-requests/new?source={{source}}&dest={{destination}} ``` ### AWS CodeCommit diff --git a/website/docs/docs/collaborate/git/resolve-merge-conflicts.md b/website/docs/docs/collaborate/git/resolve-merge-conflicts.md deleted file mode 100644 index 1058046856c..00000000000 --- a/website/docs/docs/collaborate/git/resolve-merge-conflicts.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "Resolve merge conflicts" -id: "resolve-merge-conflicts" ---- - -Merge conflicts often occur when multiple users are concurrently making edits to the same section in the same file. This makes it difficult for Git to determine which change should be kept. The merge conflict process gives users the ability to sort out which lines of code should be kept and committed. Here we'll show you how you'd resolve merge conflicts in the IDE! - -## Running into a merge conflict - -In this example we have a column that represents the total number of orders your customer has ever had. It's currently named `number_of_orders` but your stakeholder feels like this could be named better. - -You and a teammate make changes to the column name, but have gone two separate routes. 
You rename this column to `total_number_of_orders` and your teammate has renamed it `historical_order_count`. - -Your teammate has committed their change first, so you encounter a merge conflict when you press commit. - - - -You will then see that the git action bar shows `commit and resolve...` instead of `commit` indicating that the Cloud IDE has detected some conflicts that need to be addressed. You should also see the conflict section marked with some flags: -``` -<<<<<< HEAD - your current code -====== - conflicting code ->>>>>> (some branch identifier) -``` - - - - - - - -:::info Encountering Merge Conflicts - -You could run into this merge conflict in 2 possible ways: -- Pulling changes from your main branch when someone else has merged a conflicting change -- Committing your changes to the same branch when someone else has already committed their change first (this scenario) - -The way to resolve either scenario will be exactly the same! - -::: - -## Resolving conflicts - - - -1. Choose which lines of code you'd like to preserve. Delete the rest, make sure to also delete the special flags that highlight the merge conflict. -2. Press save! You will notice the line highlights disappearing, returning to a plain white background, which is a good sign that you've resolved the conflict successfully! - -Repeat this process for every file that has a merge conflict. - -## Completing the process - -When you've resolved all the merge conflicts, the last step would be to commit the changes you've made! - -The easiest way to identify whether you've successfully resolved all conflicts would be to check the file tree. If all the files highlighted in red have a file icon next to it, instead of the warning sign, you should be good to go! - - - - - -You will also know it's time to commit when you press the `commit and resolve...` button and the modal does not have any warning messages about unresolved merge conflicts! 
- - - - - -When you're ready, write your commit message as you normally would and press the `Commit` button! - - diff --git a/website/docs/docs/collaborate/git/version-control-basics.md b/website/docs/docs/collaborate/git/version-control-basics.md index dc304d99ca6..5c88d9536b4 100644 --- a/website/docs/docs/collaborate/git/version-control-basics.md +++ b/website/docs/docs/collaborate/git/version-control-basics.md @@ -9,7 +9,7 @@ sidebar_label: "Version control basics" When you develop in the command line interface (CLI) or Cloud integrated development environment (IDE), you can leverage Git directly to version control your code. To use version control, make sure you are connected to a Git repository in the CLI or Cloud IDE. -You can create a separate branch to develop and make changes. The changes you make aren’t merged into the main branch unless it successfully passes tests. This helps keep the code organized and improves productivity by making the development process smooth. +You can create a separate branch to develop and make changes. The changes you make aren’t merged into the default branch in your connected repository (typically named the `main` branch) unless it successfully passes tests. This helps keep the code organized and improves productivity by making the development process smooth. You can read more about git terminology below and also check out [GitHub Docs](https://docs.github.com/en) as well. @@ -23,7 +23,7 @@ Check out some common git terms below that you might encounter when developing: | Branch | A branch is a parallel version of a repository. It is contained within the repository but does not affect the primary or main branch allowing you to work freely without disrupting the live version. 
When you've made the changes you want to make, you can merge your branch back into the main branch to publish your changes | | Checkout | The `checkout` command is used to create a new branch, change your current working branch to a different branch, or switch to a different version of a file from a different branch. | | Commit | A commit is a user’s change to a file (or set of files). When you make a commit to save your work, Git creates a unique ID that allows you to keep a record of the specific changes committed along with who made them and when. Commits usually contain a commit message which is a brief description of what changes were made. | -| main | The primary, base branch of all repositories. All committed and accepted changes should be on the main branch. In the Cloud IDE, the main branch is read-only. This is because any changes/edits to code cannot and should not be made directly in the base branch. A new branch should be created in order to make any changes to your project | +| main | The primary, base branch of all repositories. All committed and accepted changes should be on the main branch. In the Cloud IDE, the main branch is read-only. This is because any changes/edits to code cannot and should not be made directly in the base branch. A new branch **must** be created in the dbt Cloud IDE in order to make any changes to your project. | | Merge | Merge takes the changes from one branch and adds them into another (usually main) branch. These commits are usually first requested via pull request before being merged by a maintainer. | | Pull Request | If someone has changed code on a separate branch of a project and wants it to be reviewed to add to the main branch, they can submit a pull request. Pull requests ask the repo maintainers to review the commits made, and then, if acceptable, merge the changes upstream. A pull happens when adding the changes to the main branch. 
| | Push | A `push` updates a remote branch with the commits made to the current branch. You are literally _pushing_ your changes into the remote. | @@ -32,7 +32,7 @@ Check out some common git terms below that you might encounter when developing: ## The git button in the Cloud IDE -You can perform git tasks with the git button in the Cloud IDE. The following are descriptions of each git button command and what they do: +You can perform git tasks with the git button in the [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). The following are descriptions of each git button command and what they do: | Name | Actions | @@ -40,7 +40,7 @@ You can perform git tasks with the git button in the Cloud IDE. The following ar | Abort merge | This option allows you to cancel a merge that had conflicts. Be careful with this action because all changes will be reset and this operation can't be reverted, so make sure to commit or save all your changes before you start a merge. | | Change branch | This option allows you to change between branches (checkout). | | Commit | A commit is an individual change to a file (or set of files). When you make a commit to save your work, Git creates a unique ID (a.k.a. the "SHA" or "hash") that allows you to keep record of the specific changes committed along with who made them and when. Commits usually contain a commit message which is a brief description of what changes were made. When you make changes to your code in the future, you'll need to commit them as well. | -| Create new branch | This allows you to branch off of your base branch and edit your project. You’ll notice after initializing your project that the main branch will be read-only. This is because any changes to code cannot and should not be made directly in the base branch. A new branch should be created in order to make any changes to your project. | +| Create new branch | This allows you to branch off of your base branch and edit your project. 
You’ll notice after initializing your project that the main branch will be read-only. This is because any changes to code cannot and should not be made directly in the base branch. A new branch **must** be created in the dbt Cloud IDE in order to make any changes to your project. | | Initialize your project | This is done when first setting up your project. Initializing a project creates all required directories and files within an empty repository by using the dbt starter project.

Note: This option will not display if your repo isn't completely empty (i.e. includes a README file).

Once you click **Initialize your project**, click **Commit** to finish setting up your project. | | Open pull request | This allows you to open a pull request in Git for peers to review changes before merging into the base branch. | | Pull changes from master/main | This option is available if you are on any local branch that is behind the remote version of the base branch or the remote version of the branch that you're currently on. | @@ -53,10 +53,30 @@ You can perform git tasks with the git button in the Cloud IDE. The following ar Merge conflicts often occur when multiple users are concurrently making edits to the same section in the same file. This makes it difficult for Git to determine which change should be kept. -Refer to [resolve merge conflicts](/docs/collaborate/git/resolve-merge-conflicts) to learn how to resolve merge conflicts. +Refer to [merge conflicts](/docs/collaborate/git/merge-conflicts) to learn how to resolve merge conflicts. ## The .gitignore file -dbt Labs recommends that you exclude files so they're not tracked by Git and won't slow down your dbt project. +To make sure dbt Cloud runs smoothly, you must exclude certain sub-folders in your git repository containing your dbt project from being tracked by git. You can achieve this by adding three lines to a special file named [.gitignore](https://github.com/dbt-labs/dbt-starter-project/blob/main/.gitignore). This file is placed in the root folder of your dbt project. -You can do this with a special file named [.gitignore](https://github.com/dbt-labs/dbt-starter-project/blob/main/.gitignore) which is automatically included in your dbt project after you initialize it in dbt Cloud. The `.gitignore` file must be placed at the root of your dbt project. +Some git providers will automatically create a 'boilerplate' `.gitignore` file when the repository is created. 
However, based on dbt Labs' experience, these default `.gitignore` files typically don't include the required entries for dbt Cloud to function correctly. + +The `.gitignore` file can include unrelated files and folders if the code repository requires it. However, the following folders must be included in the `.gitignore` file to ensure dbt Cloud operates smoothly: + +``` +dbt_packages/ +logs/ +target/ +``` + +**Note** — By using a trailing slash, these lines in the `.gitignore` file serve as 'folder wildcards', excluding all files and folders within those folders from being tracked by git. + + +:::note + +- **dbt Cloud projects created after Dec 1, 2022** — If you use the **Initialize dbt Project** button in the dbt Cloud IDE to set up a new and empty dbt project, dbt Cloud will automatically add a `.gitignore` file with the required entries. If a `.gitignore` file already exists, the necessary folders will be appended to the existing file. + +- **Migrating project from Core to dbt Cloud** — Make sure you check that the `.gitignore` file contains the necessary entries. dbt Core doesn't interact with git so dbt Cloud doesn't automatically add or verify entries in the `.gitignore` file. Additionally, if the repository already contains dbt code and doesn't require initialization, dbt Cloud won't add any missing entries to the `.gitignore` file. +::: + +For additional info or troubleshooting tips please refer to the [detailed FAQ](/faqs/Git/gitignore). 
diff --git a/website/docs/docs/collaborate/govern/about-model-governance.md b/website/docs/docs/collaborate/govern/about-model-governance.md new file mode 100644 index 00000000000..bbc430845d2 --- /dev/null +++ b/website/docs/docs/collaborate/govern/about-model-governance.md @@ -0,0 +1,20 @@ +--- +title: "About model governance" +id: about-model-governance +description: "Information about new features related to model governance" +pagination_next: "docs/collaborate/govern/model-access" +pagination_prev: null +--- + + +:::info New functionality +This functionality is new in v1.5. +::: + + + +[**Model access**](model-access): Some models are mature, reusable data products. Others are your team's implementation details on the way there. Mark models as "public" or "private" to make the distinction clear and to control who else can `ref` them. + +[**Model contracts**](model-contracts): Guarantee the shape of a model while it is building to avoid surprises or breaking changes for downstream queries. Explicitly define column names, data types, and constraints (as supported by your data platform). + +[**Model versions**](model-versions): When a breaking change is unavoidable, provide a smoother upgrade pathway by creating a new version of the model. These model versions share a common reference name and can reuse properties & configurations. diff --git a/website/docs/docs/collaborate/govern/model-access.md b/website/docs/docs/collaborate/govern/model-access.md new file mode 100644 index 00000000000..765e833ac0c --- /dev/null +++ b/website/docs/docs/collaborate/govern/model-access.md @@ -0,0 +1,206 @@ +--- +title: "Model access" +id: model-access +sidebar_label: "Model access" +description: "Define model access with group capabilities" +--- + + + +:::info New functionality +This functionality is new in v1.5 — if you have thoughts, participate in [the discussion on GitHub](https://github.com/dbt-labs/dbt-core/discussions/6730)!
+::: + +:::info "Model access" is not "User access" + +**Model groups and access** and **user groups and access** mean two different things. "User groups and access" is a specific term used in dbt Cloud to manage permissions. Refer to [User access](/docs/cloud/manage-access/about-user-access) for more info. + +The two concepts will be closely related, as we develop multi-project collaboration workflows this year: +- Users with access to develop in a dbt project can view and modify **all** models in that project, including private models. +- Users in the same dbt Cloud account _without_ access to develop in a project cannot view that project's private models, and they can take a dependency on its public models only. +::: + + + + +## Related documentation +* [`groups`](/docs/build/groups) +* [`access`](/reference/resource-configs/access) + +## Groups + +Models can be grouped under a common designation with a shared owner. For example, you could group together all models owned by a particular team, or related to modeling a specific data source (`github`). + +Why define model `groups`? There are two reasons: +- It turns implicit relationships into an explicit grouping, with a defined owner. By thinking about the interface boundaries _between_ groups, you can have a cleaner (less entangled) DAG. In the future, those interface boundaries could be appropriate as the interfaces between separate projects. +- It enables you to designate certain models as having "private" access—for use exclusively within that group. Other models will be restricted from referencing (taking a dependency on) those models. In the future, they won't be visible to other teams taking a dependency on your project—only "public" models will be. + +If you follow our [best practices for structuring a dbt project](/guides/best-practices/how-we-structure/1-guide-overview), you're probably already using subdirectories to organize your dbt project. 
It's easy to apply a `group` label to an entire subdirectory at once: + + + +```yml +models: + my_project_name: + marts: + customers: + +group: customer_success + finance: + +group: finance +``` + + + +Each model can only belong to one `group`, and groups cannot be nested. If you set a different `group` in that model's YAML or in-file config, it will override the `group` applied at the project level. + +## Access modifiers + +Some models are implementation details, meant for reference only within their group of related models. Other models should be accessible through the [ref](/reference/dbt-jinja-functions/ref) function across groups and projects. Models can set an [access modifier](https://en.wikipedia.org/wiki/Access_modifiers) to indicate their intended level of accessibility. + +| Access | Referenceable by | +|-----------|----------------------------------------| +| private | same group | +| protected | same project (or installed as package) | +| public | any group, package or project | + +If you try to reference a model outside of its supported access, you will see an error: + +```shell +dbt run -s marketing_model +... +dbt.exceptions.DbtReferenceError: Parsing Error + Node model.jaffle_shop.marketing_model attempted to reference node model.jaffle_shop.finance_model, + which is not allowed because the referenced node is private to the finance group. +``` + +By default, all models are `protected`. This means that other models in the same project can reference them, regardless of their group. This is largely for backwards compatibility when assigning groups to an existing set of models, as there may already be existing references across group assignments. + +However, it is recommended to set the access modifier of a new model to `private` to prevent other project resources from taking dependencies on models not intentionally designed for sharing across groups.
+ + + +```yaml +# First, define the group and owner +groups: + - name: customer_success + owner: + name: Customer Success Team + email: cx@jaffle.shop + +# Then, add 'group' + 'access' modifier to specific models +models: + # This is a public model -- it's a stable & mature interface for other teams/projects + - name: dim_customers + group: customer_success + access: public + + # This is a private model -- it's an intermediate transformation intended for use in this context *only* + - name: int_customer_history_rollup + group: customer_success + access: private + + # This is a protected model -- it might be useful elsewhere in *this* project, + # but it shouldn't be exposed elsewhere + - name: stg_customer__survey_results + group: customer_success + access: protected +``` + + + + + +Models with `materialized` set to `ephemeral` cannot have the access property set to public. + +For example, if you have a model config set as: + + + +```sql + +{{ config(materialized='ephemeral') }} + +``` + + + +And the model's access is defined as: + + + +```yaml + +models: + - name: my_model + access: public + +``` + + + +It will lead to the following error: + +``` +❯ dbt parse +02:19:30 Encountered an error: +Parsing Error + Node model.jaffle_shop.my_model with 'ephemeral' materialization has an invalid value (public) for the access field +``` + + + +## FAQs + +### How does model access relate to database permissions? + +These are different! + +Specifying `access: public` on a model does not trigger dbt to automagically grant `select` on that model to every user or role in your data platform when you materialize it. You have complete control over managing database permissions on every model/schema, as makes sense to you & your organization. + +Of course, dbt can facilitate this by means of [the `grants` config](/reference/resource-configs/grants), and other flexible mechanisms.
For example: +- Grant access to downstream queriers on public models +- Restrict access to private models, by revoking default/future grants, or by landing them in a different schema + +As we continue to develop multi-project collaboration, `access: public` will mean that other teams are allowed to start taking a dependency on that model. This assumes that they've requested, and you've granted them access, to select from the underlying dataset. + +### How do I ref a model from another project? + + + +In dbt Core v1.5 (and earlier versions), the only way to reference a model from another project is by installing that project as a package, including its full source code. It is not possible to restrict references across projects based on model `access`. + +For more control over per-model access across projects, select v1.6 (or newer) from the version dropdown. + + + + + +You can `ref` a model from another project in two ways: +1. [Project dependency](/docs/collaborate/govern/project-dependencies): In dbt Cloud Enterprise, you can use project dependencies to `ref` a model. dbt Cloud uses a behind-the-scenes metadata service to resolve the reference, enabling efficient collaboration across teams and at scale. +2. ["Package" dependency](/docs/build/packages): Another way to `ref` a model from another project is to treat the other project as a package dependency. This requires installing the other project as a package, including its full source code, as well as its upstream dependencies. + +### How do I restrict access to models defined in a package? + +Source code installed from a package becomes part of your runtime environment. You can call macros and run models as if they were macros and models that you had defined in your own project. + +For this reason, model access restrictions are "off" by default for models defined in packages. You can reference models from that package regardless of their `access` modifier. 
+ +The project being installed as a package can optionally restrict external `ref` access to just its public models. The package maintainer does this by setting a `restrict-access` config to `True` in `dbt_project.yml`. + +By default, the value of this config is `False`. This means that: +- Models in the package with `access: protected` may be referenced by models in the root project, as if they were defined in the same project +- Models in the package with `access: private` may be referenced by models in the root project, so long as they also have the same `group` config + +When `restrict-access: True`: +- Any `ref` from outside the package to a protected or private model in that package will fail. +- Only models with `access: public` can be referenced outside the package. + + + +```yml +restrict-access: True # default is False +``` + + + + diff --git a/website/docs/docs/collaborate/govern/model-contracts.md b/website/docs/docs/collaborate/govern/model-contracts.md new file mode 100644 index 00000000000..442a20df1b6 --- /dev/null +++ b/website/docs/docs/collaborate/govern/model-contracts.md @@ -0,0 +1,212 @@ +--- +title: "Model contracts" +id: model-contracts +sidebar_label: "Model contracts" +description: "Model contracts define a set of parameters validated during transformation" +--- + + + +:::info New functionality +This functionality is new in v1.5 — if you have thoughts, participate in [the discussion on GitHub](https://github.com/dbt-labs/dbt-core/discussions/6726)! +::: + + + + +## Related documentation +* [`contract`](/reference/resource-configs/contract) +* [`columns`](/reference/resource-properties/columns) +* [`constraints`](/reference/resource-properties/constraints) + +## Why define a contract? + +Defining a dbt model is as easy as writing a SQL `select` statement. Your query naturally produces a dataset with columns of names and types based on the columns you select and the transformations you apply. 
+ +While this is ideal for quick and iterative development, for some models, constantly changing the shape of its returned dataset poses a risk when other people and processes are querying that model. It's better to define a set of upfront "guarantees" that define the shape of your model. We call this set of guarantees a "contract." While building your model, dbt will verify that your model's transformation will produce a dataset matching up with its contract, or it will fail to build. + +## Where are contracts supported? + +At present, model contracts are supported for: +- SQL models. Contracts are not yet supported for Python models. +- Models materialized as `table`, `view`, and `incremental` (with `on_schema_change: append_new_columns`). Views offer limited support for column names and data types, but not `constraints`. Contracts are not supported for `ephemeral`-materialized models. +- Certain data platforms, but the supported and enforced `constraints` vary by platform. + +## How to define a contract + +Let's say you have a model with a query like: + + + +```sql +-- lots of SQL + +final as ( + + select + customer_id, + customer_name, + -- ... many more ... + from ... + +) + +select * from final +``` + + + +To enforce a model's contract, set `enforced: true` under the `contract` configuration. + +When enforced, your contract _must_ include every column's `name` and `data_type` (where `data_type` matches one that your data platform understands). + +If your model is materialized as `table` or `incremental`, and depending on your data platform, you may optionally specify additional [constraints](/reference/resource-properties/constraints), such as `not_null` (containing zero null values). + + + +```yaml +models: + - name: dim_customers + config: + contract: + enforced: true + columns: + - name: customer_id + data_type: int + constraints: + - type: not_null + - name: customer_name + data_type: string + ... 
+``` + + + +When building a model with a defined contract, dbt will do two things differently: +1. dbt will run a "preflight" check to ensure that the model's query will return a set of columns with names and data types matching the ones you have defined. This check is agnostic to the order of columns specified in your model (SQL) or YAML spec. +2. dbt will include the column names, data types, and constraints in the DDL statements it submits to the data platform, which will be enforced while building or updating the model's table. + +## Platform constraint support + +Select the adapter-specific tab for more information on [constraint](/reference/resource-properties/constraints) support across platforms. Constraints fall into three categories based on support and platform enforcement: + +- **Supported and enforced** — The model won't build if it violates the constraint. +- **Supported and not enforced** — The platform supports specifying the type of constraint, but a model can still build even if building the model violates the constraint. This constraint exists for metadata purposes only. This is common for modern cloud data warehouses and less common for legacy databases. +- **Not supported and not enforced** — You can't specify the type of constraint for the platform. 
+ + + + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:-----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ✅ Enforced | +| foreign_key | ✅ Supported | ✅ Enforced | +| unique | ❌ Not supported | ❌ Not enforced | +| check | ❌ Not supported | ❌ Not enforced | + + + + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:--------------------| +| not_null | ✅ Supported | ✅ Enforced | +| primary_key | ✅ Supported | ✅ Enforced | +| foreign_key | ✅ Supported | ✅ Enforced | +| unique | ✅ Supported | ✅ Enforced | +| check | ✅ Supported | ✅ Enforced | + + + + +Currently, `not_null` and `check` constraints are supported and enforced only after a model builds. Because of this platform limitation, dbt considers these constraints `supported` but `not enforced`, which means they're not part of the "model contract" since these constraints can't be enforced at build time. This table will change as the features evolve. 
+ +| Constraint type | Support | Platform enforcement | +|:----------------|:------------|:---------------------| +| not_null | ✅ Supported | ❌ Not enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ✅ Supported | ❌ Not enforced | + + + + +Currently, `not_null` and `check` constraints are supported and enforced only after a model builds. Because of this platform limitation, dbt considers these constraints `supported` but `not enforced`, which means they're not part of the "model contract" since these constraints can't be enforced at build time. This table will change as the features evolve. + +| Constraint type | Support | Platform enforcement | +|:----------------|:-------------|:---------------------| +| not_null | ✅ Supported | ❌ Not enforced | +| primary_key | ✅ Supported | ❌ Not enforced | +| foreign_key | ✅ Supported | ❌ Not enforced | +| unique | ✅ Supported | ❌ Not enforced | +| check | ✅ Supported | ❌ Not enforced | + + + + + +## FAQs + +### Which models should have contracts? + +Any model meeting the criteria described above _can_ define a contract. We recommend defining contracts for ["public" models](model-access) that are being relied on downstream. +- Inside of dbt: Shared with other groups, other teams, and (in the future) other dbt projects. +- Outside of dbt: Reports, dashboards, or other systems & processes that expect this model to have a predictable structure. You might reflect these downstream uses with [exposures](/docs/build/exposures). + +### How are contracts different from tests? + +A model's contract defines the **shape** of the returned dataset. If the model's logic or input data doesn't conform to that shape, the model does not build. + +[Tests](/docs/build/tests) are a more flexible mechanism for validating the content of your model _after_ it's built. So long as you can write the query, you can run the test. 
Tests are more configurable, such as with [custom severity thresholds](/reference/resource-configs/severity). They are easier to debug after finding failures, because you can query the already-built model, or [store the failing records in the data warehouse](/reference/resource-configs/store_failures). + +In some cases, you can replace a test with its equivalent constraint. This has the advantage of guaranteeing the validation at build time, and it probably requires less compute (cost) in your data platform. The prerequisites for replacing a test with a constraint are: +- Making sure that your data platform can support and enforce the constraint that you need. Most platforms only enforce `not_null`. +- Materializing your model as `table` or `incremental` (**not** `view` or `ephemeral`). +- Defining a full contract for this model by specifying the `name` and `data_type` of each column. + +**Why aren't tests part of the contract?** To draw a parallel with software APIs, the structure of the API response is the contract. Quality and reliability ("uptime") are also very important attributes of an API, but they are not part of the contract per se. When the contract changes in a backwards-incompatible way, it is a breaking change that requires a bump in major version. + +### Do I need to define every column for a contract? + +Currently, dbt contracts apply to **all** columns defined in a model, and they require declaring explicit expectations about **all** of those columns. The explicit declaration of a contract is not an accident—it's very much the intent of this feature. + +At the same time, for models with many columns, we understand that this can mean a _lot_ of YAML. We are investigating the feasibility of supporting "inferred" contracts. This would enable you to define constraints and strict data typing for a subset of columns, while still detecting breaking changes on other columns by comparing against the same model in production.
This isn't the same as a "partial" contract, because all columns in the model are still checked at runtime, and matched up with what's defined _explicitly_ in your YAML contract or _implicitly_ with the comparison state. If you're interested in "inferred" contracts, please upvote or comment on [dbt-core#7432](https://github.com/dbt-labs/dbt-core/issues/7432). + + +### How are breaking changes handled? + +When comparing to a previous project state, dbt will look for breaking changes that could impact downstream consumers. If breaking changes are detected, dbt will present a contract error. + +Breaking changes include: +- Removing an existing column +- Changing the `data_type` of an existing column +- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) + +More details are available in the [contract reference](/reference/resource-configs/contract#detecting-breaking-changes). + diff --git a/website/docs/docs/collaborate/govern/model-versions.md b/website/docs/docs/collaborate/govern/model-versions.md new file mode 100644 index 00000000000..49ed65f9a36 --- /dev/null +++ b/website/docs/docs/collaborate/govern/model-versions.md @@ -0,0 +1,423 @@ +--- +title: "Model versions" +id: model-versions +sidebar_label: "Model versions" +description: "Version models to help with lifecycle management" +keyword: governance, model version, model versioning, dbt model versioning +--- + + + +:::info New functionality +This functionality is new in v1.5 — if you have thoughts, participate in [the discussion on GitHub](https://github.com/dbt-labs/dbt-core/discussions/6736)! +::: + + + +import VersionsCallout from '/snippets/_version-callout.md'; + + + +Versioning APIs is a hard problem in software engineering. The root of the challenge is that the producers and consumers of an API have competing incentives: +- Producers of an API need the ability to modify its logic and structure.
There is a real cost to maintaining legacy endpoints forever, but losing the trust of downstream users is far costlier. +- Consumers of an API need to trust in its stability: their queries will keep working, and won't break without warning. Although migrating to a newer API version incurs an expense, an unplanned migration is far costlier. + +When sharing a final dbt model with other teams or systems, that model is operating like an API. When the producer of that model needs to make significant changes, how can they avoid breaking the queries of its users downstream? + +Model versioning is a tool to tackle this problem, thoughtfully and head-on. The goal is not to make the problem go away entirely, nor to pretend it's easier or simpler than it is. + +## Related documentation +- [`versions`](/reference/resource-properties/versions) +- [`latest_version`](/reference/resource-properties/latest_version) +- [`include` & `exclude`](/reference/resource-properties/include-exclude) +- [`ref` with `version` argument](/reference/dbt-jinja-functions/ref#versioned-ref) + +## Why version a model? + +If a model defines a ["contract"](/docs/collaborate/govern/model-contracts) (a set of guarantees for its structure), it's also possible to change that model's structure in a way that breaks the previous set of guarantees. This could be as obvious as removing or renaming a column, or more subtle, like changing its data type or nullability. + +One approach is to force every model consumer to immediately handle the breaking change as soon as it's deployed to production. This is actually the appropriate answer at many smaller organizations, or while rapidly iterating on a not-yet-mature set of data models. But it doesn’t scale well beyond that. 
+ +Instead, for mature models at larger organizations, powering queries inside & outside dbt, the model owner can use **model versions** to: +- Test "prerelease" changes (in production, in downstream systems) +- Bump the latest version, to be used as the canonical source of truth +- Offer a migration window off the "old" version + +During that migration window, anywhere that model is being used downstream, it can continue to be referenced at a specific version. + +dbt Core 1.6 introduced first-class support for **deprecating models** by specifying a [`deprecation_date`](/reference/resource-properties/deprecation_date). Taken together, model versions and deprecation offer a pathway for model producers to _sunset_ old models, and consumers the time to _migrate_ across breaking changes. It's a way of managing change across an organization: develop a new version, bump the latest, slate the old version for deprecation, update downstream references, and then remove the old version. + +There is a real trade-off that exists here—the cost to frequently migrate downstream code, and the cost (and clutter) of materializing multiple versions of a model in the data warehouse. Model versions do not make that problem go away, but by setting a deprecation date, and communicating a clear window for consumers to gracefully migrate off old versions, they put a known boundary on the cost of that migration. + +## When should you version a model? + +By enforcing a model's contract, dbt can help you catch unintended changes to column names and data types that could cause a big headache for downstream queriers. If you're making these changes intentionally, you should create a new model version. If you're making a non-breaking change, you don't need a new version—such as adding a new column, or fixing a bug in an existing column's calculation. 
+ +Of course, it's possible to change a model's definition in other ways—recalculating a column in a way that doesn't change its name, data type, or enforceable characteristics—but would substantially change the results seen by downstream queriers. + +This is always a judgment call. As the maintainer of a widely-used model, you know best what's a bug fix and what's an unexpected behavior change. + +The process of sunsetting and migrating model versions requires real work, and likely significant coordination across teams. You should opt for non-breaking changes whenever possible. Inevitably, however, these non-breaking additions will leave your most important models with lots of unused or deprecated columns. + +Rather than constantly adding a new version for each small change, you should opt for a predictable cadence (once or twice a year, communicated well in advance) where you bump the "latest" version of your model, removing columns that are no longer being used. + +## How is this different from "version control"? + +[Version control](/docs/collaborate/git-version-control) allows your team to collaborate simultaneously on a single code repository, manage conflicts between changes, and review changes before deploying into production. In that sense, version control is an essential tool for versioning the deployment of an entire dbt project—always the latest state of the `main` branch. In general, only one version of your project code is deployed into an environment at a time. If something goes wrong, you have the ability to roll back changes by reverting a commit or pull request, or by leveraging data platform capabilities around "time travel." + +When you make updates to a model's source code — its logical definition, in SQL or Python, or related configuration — dbt can [compare your project to the previous state](/reference/node-selection/syntax#about-node-selection), enabling you to rebuild only models that have changed, and models downstream of a change. 
In this way, it's possible to develop changes to a model, quickly test in CI, and efficiently deploy into production — all coordinated via your version control system. + +**Versioned models are different.** Defining model `versions` is appropriate when people, systems, and processes beyond your team's control, inside or outside of dbt, depend on your models. You can neither simply go migrate them all, nor break their queries on a whim. You need to offer a migration path, with clear diffs and deprecation dates. + +Multiple versions of a model will live in the same code repository at the same time, and be deployed into the same data environment simultaneously. This is similar to how web APIs are versioned: Multiple versions are live simultaneously (two or three, and not more). Over time, newer versions come online, and older versions are sunsetted. + +## How is this different from just creating a new model? + +Honestly, it's only a little bit different! There isn't much magic here, and that's by design. + +You've always been able to copy-paste, create a new model file, and name it `dim_customers_v2.sql`. Why should you opt for a "real" versioned model instead? + +As the **producer** of a versioned model: +- You keep track of all live versions in one place, rather than scattering them throughout the codebase +- You can reuse the model's configuration, and highlight just the diffs between versions +- You can select models to build (or not) based on whether they're a `latest`, `prerelease`, or `old` version +- dbt will notify consumers of your versioned model when new versions become available, or when they are slated for deprecation + +As the **consumer** of a versioned model: +- You use a consistent `ref`, with the option of pinning to a specific live version +- You will be notified throughout the life cycle of a versioned model + +All versions of a model preserve the model's original name.
They are `ref`'d by that name, rather than the name of the file that they're defined in. By default, the `ref` resolves to the latest version (as declared by that model's maintainer), but you can also `ref` a specific version of the model, with a `version` keyword. + +Let's say that `dim_customers` has three versions defined: `v2` is the "latest", `v3` is "prerelease," and `v1` is an old version that's still within its deprecation window. Because `v2` is the latest version, it gets some special treatment: it can be defined in a file without a suffix, and `ref('dim_customers')` will resolve to `v2` if a version pin is not specified. The table below breaks down the standard conventions: + +| v | version | `ref` syntax | File name | Database relation | +|---|------------|-------------------------------------------------------|-------------------------------------------------|--------------------------------------------------------------------------| +| 3 | "prerelease" | `ref('dim_customers', v=3)` | `dim_customers_v3.sql` | `analytics.dim_customers_v3` | +| 2 | "latest" | `ref('dim_customers', v=2)` **and** `ref('dim_customers')` | `dim_customers_v2.sql` **or** `dim_customers.sql` | `analytics.dim_customers_v2` **and** `analytics.dim_customers` (recommended) | +| 1 | "old" | `ref('dim_customers', v=1)` | `dim_customers_v1.sql` | `analytics.dim_customers_v1` | + +As you'll see in the implementation section below, a versioned model can reuse the majority of its YAML properties and configuration. Each version needs to only say how it _differs_ from the shared set of attributes. This gives you, as the producer of a versioned model, the opportunity to highlight the differences across versions—which is otherwise difficult to detect in models with dozens or hundreds of columns—and to clearly track, in one place, all versions of the model which are currently live. + +dbt also supports [`version`-based selection](/reference/node-selection/methods#the-version-method). 
For example, you could define a [default YAML selector](/reference/node-selection/yaml-selectors#default) that avoids running any old model versions in development, even while you continue to run them in production through a sunset and migration period. (You could accomplish something similar by applying `tags` to these models, and cycling through those tags over time.) + + + +```yml +selectors: + - name: exclude_old_versions + default: "{{ target.name == 'dev' }}" + definition: + method: fqn + value: "*" + exclude: + - method: version + value: old +``` + + + +Because dbt knows that these models are _actually the same model_, it can notify downstream consumers as new versions become available, and as older versions are slated for deprecation. + +```bash +Found an unpinned reference to versioned model 'dim_customers'. +Resolving to latest version: my_model.v2 +A prerelease version 3 is available. It has not yet been marked 'latest' by its maintainer. +When that happens, this reference will resolve to my_model.v3 instead. + + Try out v3: {{ ref('my_dbt_project', 'my_model', v='3') }} + Pin to v2: {{ ref('my_dbt_project', 'my_model', v='2') }} +``` + +## How to create a new version of a model + +Most often, you'll start with a model that is not yet versioned. Let's go back in time to when `dim_customers` was a simple standalone model, with an enforced contract. For simplicity, let's pretend it has only two columns, `customer_id` and `country_name`, though most mature models will have many more. + + + +```sql +-- lots of sql + +final as ( + + select + customer_id, + country_name + from ... 
+ +) + +select * from final +``` + + + + + +```yaml +models: + - name: dim_customers + config: + materialized: table + contract: + enforced: true + columns: + - name: customer_id + description: This is the primary key + data_type: int + - name: country_name + description: Where this customer lives + data_type: varchar +``` + + + +Let's say you need to make a breaking change to the model: Removing the `country_name` column, which is no longer reliable. First, create a new model file (SQL or Python) encompassing those breaking changes. + + +The default convention is naming the new file with a `_v` suffix. Let's make a new file, named `dim_customers_v2.sql`. (We don't need to rename the existing model file just yet, while it's still the "latest" version.) + + + +```sql +-- lots of sql + +final as ( + + select + customer_id + -- country_name has been removed! + from ... + +) + +select * from final +``` + + + +Now, you could define properties and configuration for `dim_customers_v2` as a new standalone model, with no actual relation to `dim_customers` save a striking resemblance. Instead, we're going to declare that these are versions of the same model, both named `dim_customers`. We can define their properties in common, and then **just** highlight the diffs between them. (Or, you can choose to define each model version with full specifications, and repeat the values they have in common.) + + + + + + +```yaml +models: + - name: dim_customers + latest_version: 1 + config: + materialized: table + contract: {enforced: true} + columns: + - name: customer_id + description: This is the primary key + data_type: int + - name: country_name + description: Where this customer lives + data_type: varchar + + # Declare the versions, and highlight the diffs + versions: + + - v: 1 + # Matches what's above -- nothing more needed + + - v: 2 + # Removed a column -- this is the breaking change! 
+ columns: + # This means: use the 'columns' list from above, but exclude country_name + - include: all + exclude: [country_name] + +``` + + + + + + + + + +```yaml +models: + - name: dim_customers + latest_version: 1 + + # declare the versions, and fully specify them + versions: + - v: 2 + config: + materialized: table + contract: {enforced: true} + columns: + - name: customer_id + description: This is the primary key + data_type: int + # no country_name column + + - v: 1 + config: + materialized: table + contract: {enforced: true} + columns: + - name: customer_id + description: This is the primary key + data_type: int + - name: country_name + description: Where this customer lives + data_type: varchar +``` + + + + + + + +The configuration above says: Instead of two unrelated models, I have two versioned definitions of the same model: `dim_customers_v1` and `dim_customers_v2`. + +**Where are they defined?** dbt expects each model version to be defined in a file named `<model_name>_v<v>`. In this case: `dim_customers_v1.sql` and `dim_customers_v2.sql`. It's also possible to define the "latest" version in `dim_customers.sql` (no suffix), without additional configuration. Finally, you can override this convention by setting [`defined_in: any_file_name_you_want`](/reference/resource-properties/versions#defined_in)—but we strongly encourage you to follow the convention, unless you have a very good reason. + +**Where will they be materialized?** Each model version will create a database relation with alias `<model_name>_v<v>`. In this case: `dim_customers_v1` and `dim_customers_v2`. See [the section below](#configuring-database-location-with-alias) for more details on configuring aliases. + +**Which version is "latest"?** If not specified explicitly, the `latest_version` would be `2`, because it's numerically greatest. In this case, we've explicitly specified that `latest_version: 1`. That means `v2` is a "prerelease," in early development and testing.
When we're ready to roll out `v2` to everyone by default, we would bump `latest_version: 2`, or remove `latest_version` from the specification. + +### Configuring versioned models + +You can reconfigure each version independently. For example, you could materialize `v2` as a table and `v1` as a view: + + + +```yml +versions: + - v: 2 + config: + materialized: table + - v: 1 + config: + materialized: view +``` + + + +As with all config inheritance, any configs set _within_ the versioned model's definition (`.sql` or `.py` file) will take precedence over the configs set in YAML. + +### Configuring database location with `alias` + +Following the example, let's say you wanted `dim_customers_v1` to continue populating the database table named `dim_customers`. That's what the table was named previously, and you may have several other dashboards or tools expecting to read its data from `<database>.<schema>.dim_customers`. + +You could use the `alias` configuration: + + + +```yml + - v: 1 + config: + alias: dim_customers # keep v1 in its original database location +``` + + + +**The pattern we recommend:** Create a view or table clone with the model's canonical name that always points to the latest version. By following this pattern, you can offer the same flexibility as `ref`, even if someone is querying outside of dbt. Want a specific version? Pin to version X by adding the `_vX` suffix. Want the latest version? No suffix, and the view will redirect you. + +We intend to build this into `dbt-core` as out-of-the-box functionality. (Upvote or comment on [dbt-core#7442](https://github.com/dbt-labs/dbt-core/issues/7442).)
In the meantime, you can implement this pattern yourself with a custom macro and post-hook: + + + +```sql +{% macro create_latest_version_view() %} + + -- this hook will run only if the model is versioned, and only if it's the latest version + -- otherwise, it's a no-op + {% if model.get('version') and model.get('version') == model.get('latest_version') %} + + {% set new_relation = this.incorporate(path={"identifier": model['name']}) %} + + {% set existing_relation = load_relation(new_relation) %} + + {% if existing_relation and not existing_relation.is_view %} + {{ drop_relation_if_exists(existing_relation) }} + {% endif %} + + {% set create_view_sql -%} + -- this syntax may vary by data platform + create or replace view {{ new_relation }} + as select * from {{ this }} + {%- endset %} + + {% do log("Creating view " ~ new_relation ~ " pointing to " ~ this, info = true) if execute %} + + {{ return(create_view_sql) }} + + {% else %} + + -- no-op + select 1 as id + + {% endif %} + +{% endmacro %} +``` + + + + + + +```yml +# dbt_project.yml +models: + post-hook: + - "{{ create_latest_version_view() }}" +``` + + + +:::info +If your project has historically implemented [custom aliases](/docs/build/custom-aliases) by reimplementing the `generate_alias_name` macro, and you'd like to start using model versions, you should update your custom implementation to account for model versions. Specifically, we'd encourage you to add [a condition like this one](https://github.com/dbt-labs/dbt-core/blob/ada8860e48b32ac712d92e8b0977b2c3c9749981/core/dbt/include/global_project/macros/get_custom_name/get_custom_alias.sql#L26-L30). + +Your existing implementation of `generate_alias_name` should not encounter any errors upon first upgrading to v1.5. It's only when you create your first versioned model, that you may see an error like: + +```sh +dbt.exceptions.AmbiguousAliasError: Compilation Error + dbt found two resources with the database representation "database.schema.model_name". 
+ dbt cannot create two resources with identical database representations. To fix this, + change the configuration of one of these resources: + - model.project_name.model_name.v1 (models/.../model_name.sql) + - model.project_name.model_name.v2 (models/.../model_name_v2.sql) +``` + +We opted to use `generate_alias_name` for this functionality so that the logic remains accessible to end users, and could be reimplemented with custom logic. +::: + +### Optimizing model versions + +How you define each model version is completely up to you. While it's easy to start by copy-pasting from one model's SQL definition into another, you should think about _what actually is changing_ from one version to another. + +For example, if your new model version is only renaming or removing certain columns, you could define one version as a view on top of the other one: + + + +```sql +{{ config(materialized = 'view') }} + +{% set dim_customers_v1 = ref('dim_customers', v=1) %} + +select +{{ dbt_utils.star(from=dim_customers_v1, except=["country_name"]) }} +from {{ dim_customers_v1 }} +``` + + + +Of course, if one model version makes meaningful and substantive changes to logic in another, it may not be possible to optimize it in this way. At that point, the cost of human intuition and legibility is more important than the cost of recomputing similar transformations. + +We expect to develop more opinionated recommendations as teams start adopting model versions in practice. One recommended pattern we can envision: Prioritize the definition of the `latest_version`, and define other versions (old and prerelease) based on their diffs from the latest. How? 
+- Define the properties and configuration for the latest version in the top-level model YAML, and the diffs for other versions below (via `include`/`exclude`) +- Where possible, define other versions as `select` transformations, which take the latest version as their starting point +- When bumping the `latest_version`, migrate the SQL and YAML accordingly. + +In the example above, the third point might be tricky. It's easier to _exclude_ `country_name`, than it is to add it back in. Instead, we might need to keep around the full original logic for `dim_customers_v1`—but materialize it as a `view`, to minimize the data warehouse cost of building it. If downstream queriers see slightly degraded performance, it's still significantly better than broken queries, and all the more reason to migrate to the new "latest" version. diff --git a/website/docs/docs/collaborate/govern/project-dependencies.md b/website/docs/docs/collaborate/govern/project-dependencies.md new file mode 100644 index 00000000000..9a1d8b59b68 --- /dev/null +++ b/website/docs/docs/collaborate/govern/project-dependencies.md @@ -0,0 +1,142 @@ +--- +title: "Project dependencies" +id: project-dependencies +sidebar_label: "Project dependencies" +description: "Reference public models across dbt projects" +pagination_next: null +--- + +:::info Available in Public Preview for dbt Cloud Enterprise accounts + +Project dependencies and cross-project `ref` are features available in [dbt Cloud Enterprise](https://www.getdbt.com/pricing), currently in [Public Preview](/docs/dbt-versions/product-lifecycles#dbt-cloud). + +Enterprise users can use these features by designating a [public model](/docs/collaborate/govern/model-access) and adding a [cross-project ref](#how-to-use-ref). +::: + + +For a long time, dbt has supported code reuse and extension by installing other projects as [packages](/docs/build/packages). 
When you install another project as a package, you are pulling in its full source code, and adding it to your own. This enables you to call macros and run models defined in that other project. + +While this is a great way to reuse code, share utility macros, and establish a starting point for common transformations, it's not a great way to enable collaboration across teams and at scale, especially at larger organizations. + +This year, dbt Labs is introducing an expanded notion of `dependencies` across multiple dbt projects: +- **Packages** — Familiar and pre-existing type of dependency. You take this dependency by installing the package's full source code (like a software library). +- **Projects** — A _new_ way to take a dependency on another project. Using a metadata service that runs behind the scenes, dbt Cloud resolves references on-the-fly to public models defined in other projects. You don't need to parse or run those upstream models yourself. Instead, you treat your dependency on those models as an API that returns a dataset. The maintainer of the public model is responsible for guaranteeing its quality and stability. + + +In dbt v1.6 and higher, `packages.yml` has been renamed to `dependencies.yml`. However, if you need to use Jinja within your packages config, such as an environment variable for your private package, you need to keep using `packages.yml` for your packages for now. Refer to the [FAQs](#faqs) for more info. + +## Prerequisites + +In order to add project dependencies and resolve cross-project `ref`, you must: +- Use dbt v1.6 or higher for **both** the upstream ("producer") project and the downstream ("consumer") project.
+- Have a deployment environment in the upstream ("producer") project [that is set to be your production environment](/docs/deploy/deploy-environments#set-as-production-environment-beta) +- Have a successful run of the upstream ("producer") project +- Have a multi-tenant or single-tenant [dbt Cloud Enterprise](https://www.getdbt.com/pricing) account (Azure ST is not yet supported, but is coming soon) + + +## Example + +As an example, let's say you work on the Marketing team at the Jaffle Shop. The name of your team's project is `jaffle_marketing`: + + + +```yml +name: jaffle_marketing +``` + + + +As part of your modeling of marketing data, you need to take a dependency on two other projects: +- `dbt_utils` as a [package](#packages-use-case): A collection of utility macros that you can use while writing the SQL for your own models. This package is open-source, public, and maintained by dbt Labs. +- `jaffle_finance` as a [project use-case](#projects-use-case): Data models about the Jaffle Shop's revenue. This project is private and maintained by your colleagues on the Finance team. You want to select from some of this project's final models, as a starting point for your own work. + + + +```yml +packages: + - package: dbt-labs/dbt_utils + version: 1.1.1 + +projects: + - name: jaffle_finance # matches the 'name' in their 'dbt_project.yml' +``` + + + +What's happening here? + +The `dbt_utils` package — When you run `dbt deps`, dbt will pull down this package's full contents (100+ macros) as source code and add them to your environment. You can then call any macro from the package, just as you can call macros defined in your own project. + +The `jaffle_finance` project — This is a new scenario. Unlike installing a package, the models in the `jaffle_finance` project will _not_ be pulled down as source code and parsed into your project.
Instead, dbt Cloud provides a metadata service that resolves references to [**public models**](/docs/collaborate/govern/model-access) defined in the `jaffle_finance` project. + +### Advantages + +When you're building on top of another team's work, resolving the references in this way has several advantages: +- You're using an intentional interface designated by the model's maintainer with `access: public`. +- You're keeping the scope of your project narrow, and avoiding unnecessary resources and complexity. This is faster for you and faster for dbt. +- You don't need to mirror any conditional configuration of the upstream project such as `vars`, environment variables, or `target.name`. You can reference them directly wherever the Finance team is building their models in production. Even if the Finance team makes changes like renaming the model, changing the name of its schema, or [bumping its version](/docs/collaborate/govern/model-versions), your `ref` would still resolve successfully. +- You eliminate the risk of accidentally building those models with `dbt run` or `dbt build`. While you can select those models, you can't actually build them. This prevents unexpected warehouse costs and permissions issues. This also ensures proper ownership and cost allocation for each team's models. + +### How to use ref + +**Writing `ref`:** Models referenced from a `project`-type dependency must use [two-argument `ref`](/reference/dbt-jinja-functions/ref#two-argument-variant), including the project name: + + + +```sql +with monthly_revenue as ( + + select * from {{ ref('jaffle_finance', 'monthly_revenue') }} + +), + +... + +``` + + + +**Cycle detection:** Currently, "project" dependencies can only go in one direction, meaning that the `jaffle_finance` project could not add a new model that depends, in turn, on `jaffle_marketing.roi_by_channel`. dbt will check for cycles across projects and raise errors if any are detected. 
We are considering support for this pattern in the future, whereby dbt would still check for node-level cycles while allowing cycles at the project level. + +For more guidance on how to use dbt Mesh, refer to the dedicated [dbt Mesh guide](/guides/best-practices/how-we-mesh/mesh-1-intro). + +### Comparison + +If you were to install the `jaffle_finance` project as a `package` dependency instead, you would be pulling down its full source code and adding it to your runtime environment. This means: +- dbt needs to parse and resolve more inputs (which is slower) +- dbt expects you to configure these models as if they were your own (with `vars`, env vars, etc.) +- dbt will run these models as your own unless you explicitly `--exclude` them +- You could be using the project's models in a way that their maintainer (the Finance team) hasn't intended + +There are a few cases where installing another internal project as a package can be a useful pattern: +- Unified deployments — In a production environment, if the central data platform team of Jaffle Shop wanted to schedule the deployment of models across both `jaffle_finance` and `jaffle_marketing`, they could use dbt's [selection syntax](/reference/node-selection/syntax) to create a new "passthrough" project that installed both projects as packages. +- Coordinated changes — In development, if you wanted to test the effects of a change to a public model in an upstream project (`jaffle_finance.monthly_revenue`) on a downstream model (`jaffle_marketing.roi_by_channel`) _before_ introducing changes to a staging or production environment, you can install the `jaffle_finance` project as a package within `jaffle_marketing`. The installation can point to a specific git branch. However, if you find yourself frequently needing to perform end-to-end testing across both projects, we recommend you re-examine whether this represents a stable interface boundary. + +These are the exceptions, rather than the rule.
Installing another team's project as a package adds complexity, latency, and risk of unnecessary costs. By defining clear interface boundaries across teams, by serving one team's public models as "APIs" to another, and by enabling practitioners to develop with a more narrowly defined scope, we can enable more people to contribute, with more confidence, while requiring less context upfront. + +## FAQs + +
+Can I define private packages in the dependencies.yml file? + +If you're using private packages with the [git token method](/docs/build/packages#git-token-method), you must define them in the `packages.yml` file instead of the `dependencies.yml` file. This is because conditional rendering (like Jinja-in-yaml) is not supported. +
+ + +## Related docs +- Refer to the [dbt Mesh](/guides/best-practices/how-we-mesh/mesh-1-intro) guide for more guidance on how to use dbt Mesh. diff --git a/website/docs/docs/collaborate/manage-access/cloud-seats-and-users.md b/website/docs/docs/collaborate/manage-access/cloud-seats-and-users.md deleted file mode 100644 index d768e7c6e57..00000000000 --- a/website/docs/docs/collaborate/manage-access/cloud-seats-and-users.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -title: "Seats and Users" -id: "seats-and-users" ---- - -## Overview - -In dbt Cloud, _seats_ are used to allocate users to your account. There are two -different types of seat licenses in dbt Cloud: _Developer_ and _Read Only_. - -The type of license a user is assigned controls which capabilities of dbt -Cloud the user is permitted to access. Users with a Developer license can be -granted access to the Deployment and [Development](/docs/get-started/develop-in-the-cloud) functionality -in dbt Cloud, whereas users with Read Only licenses are intended to view the -[artifacts](docs/dbt-cloud/using-dbt-cloud/artifacts) created in a dbt Cloud account. - -| Functionality | Developer User | Read Only Users | -| ------------- | -------------- | --------------- | -| Use the Developer IDE | ✅ | ❌ | -| Use Jobs | ✅ | ❌ | -| Manage Account | ✅ | ❌ | -| API Access | ✅ | ❌ | -| Use [Source Freshness](/docs/deploy/source-freshness) | ✅ | ✅ | -| Use [Docs](/docs/collaborate/build-and-view-your-docs) | ✅ | ✅ | - -## Included seats - -Each dbt Cloud plan comes with a base number of Developer and Read Only seats. -To add additional seats to your account, navigate to the Billing tab of the -Account Settings page in your dbt Cloud account. Note: accounts on the Developer -plan must upgrade to the Team plan to add additional team members to their -account. - -The below shows the base number of Developer and Read Only seats for each -dbt Cloud plan. 
- -| Plan | Developer Seats | Read Only Seats | -| ---- | --------------- | --------------- | -| Developer (free) | 1 | 0 | -| Team | $50/developer/mo | 5 ($0/mo) | -| Enterprise | Custom | Custom | - -## Managing license types - -Licenses can be assigned manually, or automatically based on IdP configuration -(enterprise only). By default, new users in an account will be assigned a -Developer license. - -### Manual configuration - -To manually assign a specific type of license to a user on your team, navigate -to the Team page in your Account Settings and click the "edit" button for the user -you want to manage. From this page, you can select the license type and relevant -groups for the user. - -**Note:** You will need to have an available license ready -to allocate for the user. If your account does not have an available license to -allocate, you will need to add more seats to your plan to complete the license -change. - - - -### Mapped configuration - -**Note:** This feature is only available on the Enterprise plan. - -If your account is connected to an Identity Provider (IdP) for [Single Sign -On](/docs/collaborate/manage-access/sso-overview), you can automatically map IdP user -groups to specific license types in dbt Cloud. To configure license mappings, -navigate to the Account Settings > Team > License Mappings page. From -here, you can create or edit SSO mappings for both Read Only and Developer -license types. - -By default, all new members of a dbt Cloud account will be assigned a Developer -license. To assign Read Only licenses to certain groups of users, create a new -License Mapping for the Read Only license type and include a comma separated -list of IdP group names that should receive a Read Only license at sign-in time. 
- - - -Usage notes: -- If a user's IdP groups match both a Developer and Read Only license type - mapping, a Developer license type will be assigned -- If a user's IdP groups do not match _any_ license type mappings, a Developer - license will be assigned -- License types are adjusted when users sign into dbt Cloud via Single Sign On. - Changes made to license type mappings will take effect the next time users - sign in to dbt Cloud. -- License type mappings are based on _IdP Groups_, not _dbt Cloud groups_, so be - sure to check group memberships in your identity provider when configuring - this feature. - - -## Granular permissioning - -The dbt Cloud Enterprise plan supports Role-Based access controls for -configuring granular in-app permissions. See [access control](/docs/collaborate/manage-access/about-access) -for more information on Enterprise permissioning. diff --git a/website/docs/docs/collaborate/manage-access/enterprise-permissions.md b/website/docs/docs/collaborate/manage-access/enterprise-permissions.md deleted file mode 100644 index 7a0031d3c7a..00000000000 --- a/website/docs/docs/collaborate/manage-access/enterprise-permissions.md +++ /dev/null @@ -1,205 +0,0 @@ ---- -title: "Enterprise permissions" -id: "enterprise-permissions" -description: "Permission sets for Enterprise plans." ---- - -:::info Enterprise Feature - -This guide describes a feature of the dbt Cloud Enterprise plan. -If you're interested in learning more about an Enterprise plan, contact us at sales@getdbt.com. - -::: - -## Overview - -The dbt Cloud Enterprise plan supports a number of pre-built permission sets to -help manage access controls within a dbt Cloud account. See the docs on [access -control](/docs/collaborate/manage-access/about-access) for more information on Role-Based access -control (RBAC). - -## Permission Sets - -The following permission sets are available for assignment in dbt Cloud Enterprise accounts. 
They -can be granted to dbt Cloud groups which are then in turn granted to users. A dbt Cloud group -can be associated with more than one permission sets. - -### Account Admin - -- **Has permissions on:** Authorized projects, account-level settings -- **License restrictions:** must have a developer license - -Account Admins have unrestricted access to dbt Cloud accounts. Users with Account Admin permissions can: - -- Create, delete and modify all projects in an account -- Create, delete, and modify Repositories -- Create, delete, and modify Connections -- Create, delete, and modify Environments -- Create, delete, and modify Jobs -- Create, delete, and modify Groups -- Create, delete, and modify Group Memberships -- Manage Notification Settings -- Manage account-level [artifacts](dbt-cloud/using-dbt-cloud/artifacts) -- View and modify Account Settings -- Use the IDE -- Run and cancel jobs - -### Project Creator -- **Has permissions on:** Authorized projects, account-level settings -- **License restrictions:** must have a developer license - -Project Creators have write and read-only access to dbt Cloud accounts, but do not have the permissions required to modify SSO settings and account integrations. - -Users with Project Creator permissions can: - -- View Account Settings -- View and modify project users -- Create, delete and modify all projects in an account -- Create, delete, and modify Repositories -- Create, delete, and modify Connections -- Create, delete, and modify Environments -- Create, delete, and modify Jobs -- Use the IDE -- Run and cancel jobs -- View Groups -- View Notification Settings - -### Account Viewer - -- **Has permissions on:** Authorized projects, account-level settings -- **License restrictions:** must have a developer license - -Account Viewers have read only access to dbt Cloud accounts. 
Users with Account Viewer permissions can: -- View all projects in an account -- View Account Settings -- View Repositories -- View Connections -- View Environments -- View Jobs -- View Groups -- View Group Memberships -- View Notification Settings -- View account-level artifacts - -### Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Admins have unrestricted access to _projects_ in dbt Cloud accounts which they are members of. -Admins can perform the following actions in projects they are assigned to: -- View project details -- Create, delete, and modify Repositories -- Create, delete, and modify Connections -- Create, delete, and modify Environments -- Create, delete, and modify Jobs -- Create, delete, and modify Group Memberships -- Use the IDE -- Run and cancel jobs - -### Git Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Git Admins can perform the following actions in projects they are assigned to: -- View project details -- Create, delete, and modify Repositories -- View Connections -- View Environments -- View Jobs - -### Database Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Database Admins can perform the following actions in projects they are assigned to: -- View project details -- Create, delete, and modify Connections -- View Repositories -- View Environments -- View Jobs - -### Team Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Team Admins can perform the following actions in projects they are assigned to: -- View project details -- Create, delete, and modify group memberships -- View Repositories -- View Environments -- View Jobs - -### Job Admin -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Job Admins can perform the 
following actions in projects they are assigned to: -- View, edit, and create environments -- View connections -- Trigger runs -- View historical runs - -### Job Viewer -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Job Viewers can perform the following actions in projects they are assigned to: -- View environments -- View job definitions -- View historical runs - -### Developer -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Developers can perform the following actions in projects they are assigned to: -- Create, delete, and modify Jobs -- Trigger runs -- Use the IDE -- Configure personal developer credentials - -### Analyst -- **Has permissions on:** Authorized projects -- **License restrictions:** must have a developer license - -Analysts can perform the following actions in projects they are assigned to: -- Use the IDE -- Configure personal developer credentials -- View connections -- View environments -- View job definitions -- View historical runs - - -### Stakeholder -- **Has permissions on:** Authorized projects -- **License restrictions:** Intended for use with Read Only licenses, but may be used with Developer licenses. - -Stakeholders can perform the following actions in projects they are assigned to: -- View the Read Only dashboard -- View generated documentation -- View generated source freshness reports - -## Diagram of the Permission Sets - - - -## How to Set Up RBAC Groups in dbt Cloud - -Role-Based Access Control (RBAC) is helpful for automatically assigning permissions to dbt admins based on their SSO provider group associations. - -- **If you are on a dbt Labs Hosted dbt Cloud instance:** -Contact support via the webapp button or support@getdbt.com to turn on this feature. -- **If you are on a customer deployed dbt Cloud instance:** -Contact your account manager for instructions on how to turn on this feature. 
- -Click the gear icon to the top right and select **Account Settings**. From the **Team** section, click **Groups** - - - -1. Select an existing group or create a new group to add RBAC. Name the group (this can be any name you like, but it's recommended to keep it consistent with the SSO groups). If you have configured SSO with SAML 2.0, you may have to use the GroupID instead of the name of the group. -2. Configure the SSO provider groups you want to add RBAC by clicking **Add** in the **SSO** section. These fields are case sensitive and must match the source group formatting. -3. Configure the permissions for users within those groups by clicking **Add** in the **Access** section of the window. - - -4. When you've completed your configurations, click **Save**. Users will begin to populate the group automatically once they have signed in to dbt Cloud with their SSO credentials. diff --git a/website/docs/docs/community-adapters.md b/website/docs/docs/community-adapters.md new file mode 100644 index 00000000000..87d1bd4981e --- /dev/null +++ b/website/docs/docs/community-adapters.md @@ -0,0 +1,20 @@ +--- +title: "Community adapters" +id: "community-adapters" +--- + +Community adapters are adapter plugins contributed and maintained by members of the community. We welcome and encourage [adapter plugins contributions](/docs/contribute-core-adapters#contribute-to-a-pre-existing-adapter) from the dbt community. Please be mindful that these [community maintainers](/docs/connect-adapters#maintainers) are intrepid volunteers who donate their time and effort — so be kind, understanding, and help out where you can! 
+ +| Data platforms (click to view setup guide) || +| ------------------------------------------ | -------------------------------- | ------------------------------------- | +| [Athena](/docs/core/connect-data-platform/athena-setup) | [Greenplum](/docs/core/connect-data-platform/greenplum-setup) | [Oracle](/docs/core/connect-data-platform/oracle-setup) | +| [Clickhouse](/docs/core/connect-data-platform/clickhouse-setup) | [Hive](/docs/core/connect-data-platform/hive-setup) | [Rockset](/docs/core/connect-data-platform/rockset-setup) | +| [IBM DB2](/docs/core/connect-data-platform/ibmdb2-setup) | [Impala](/docs/core/connect-data-platform/impala-setup) | [SingleStore](/docs/core/connect-data-platform/singlestore-setup) | +| [Doris & SelectDB](/docs/core/connect-data-platform/doris-setup) | [Infer](/docs/core/connect-data-platform/infer-setup) | [SQLite](/docs/core/connect-data-platform/sqlite-setup) | +| [DuckDB](/docs/core/connect-data-platform/duckdb-setup) | [iomete](/docs/core/connect-data-platform/iomete-setup) | [SQL Server & Azure SQL](/docs/core/connect-data-platform/mssql-setup) | +| [Dremio](/docs/core/connect-data-platform/dremio-setup) | [Layer](/docs/core/connect-data-platform/layer-setup) | [Teradata](/docs/core/connect-data-platform/teradata-setup) | +| [Exasol Analytics](/docs/core/connect-data-platform/exasol-setup) | [Materialize](/docs/core/connect-data-platform/materialize-setup) | [TiDB](/docs/core/connect-data-platform/tidb-setup) | +| [Firebolt](/docs/core/connect-data-platform/firebolt-setup) | [MindsDB](/docs/core/connect-data-platform/mindsdb-setup) | [Vertica](/docs/core/connect-data-platform/vertica-setup) | +| [AWS Glue](/docs/core/connect-data-platform/glue-setup) | [MySQL](/docs/core/connect-data-platform/mysql-setup)| [Upsolver](/docs/core/connect-data-platform/upsolver-setup) | +| [Databend Cloud](/docs/core/connect-data-platform/databend-setup) | [fal - Python models](/docs/core/connect-data-platform/fal-setup) | | + diff --git 
a/website/docs/docs/connect-adapters.md b/website/docs/docs/connect-adapters.md new file mode 100644 index 00000000000..77ead34e51d --- /dev/null +++ b/website/docs/docs/connect-adapters.md @@ -0,0 +1,25 @@ +--- +title: "How to connect to adapters" +id: "connect-adapters" +--- + +Adapters are an essential component of dbt. At their most basic level, they are how dbt connects with the various supported data platforms. At a higher-level, adapters strive to give analytics engineers more transferrable skills as well as standardize how analytics projects are structured. Gone are the days where you have to learn a new language or flavor of SQL when you move to a new job that has a different data platform. That is the power of adapters in dbt — for more detail, read the [What are adapters](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) guide. + +This section provides more details on different ways you can connect dbt to an adapter, and explains what a maintainer is. + +### Set up in dbt Cloud + +Explore the fastest and most reliable way to deploy dbt using dbt Cloud, a hosted architecture that runs dbt Core across your organization. dbt Cloud lets you seamlessly [connect](/docs/cloud/about-cloud-setup) with a variety of [verified](/docs/supported-data-platforms) data platform providers directly in the dbt Cloud UI. + +### Install with dbt Core + +Install dbt Core, an open-source tool, locally using the command line. dbt communicates with a number of different data platforms by using a dedicated adapter plugin for each. When you install dbt Core, you'll also need to install the specific adapter for your database, [connect to dbt Core](/docs/core/about-core-setup), and set up a `profiles.yml` file. + +With a few exceptions [^1], you can install all [Verified adapters](/docs/supported-data-platforms) from PyPI using `pip install adapter-name`. For example to install Snowflake, use the command `pip install dbt-snowflake`. 
The installation will include `dbt-core` and any other required dependencies, which may include both other dependencies and even other adapter plugins. Read more about [installing dbt](/docs/core/installation). + +[^1]: Here are the two different adapters. Use the PyPI package name when installing with `pip` + + | Adapter repo name | PyPI package name | + | ----------------- | -------------------- | + | `dbt-athena` | `dbt-athena-adapter` | + | `dbt-layer` | `dbt-layer-bigquery` | diff --git a/website/docs/docs/contribute-core-adapters.md b/website/docs/docs/contribute-core-adapters.md new file mode 100644 index 00000000000..553361ee1a2 --- /dev/null +++ b/website/docs/docs/contribute-core-adapters.md @@ -0,0 +1,22 @@ +--- +title: "Contribute to adapters" +id: "contribute-core-adapters" +pagination_next: null +--- + +The dbt Community exists to allow analytics practitioners to share their knowledge, help others, and collectively drive forward the discipline of analytics engineering. There are opportunities here for everyone to contribute, whether you're at the beginning of your analytics engineering journey or you are a seasoned data professional. + +This section explains how you can contribute to existing adapters, or create a new adapter. + +### Contribute to a pre-existing adapter + +Community-supported plugins are works in progress, and anyone is welcome to contribute by testing and writing code. If you're interested in contributing: + +- Join both the dedicated channel, [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM), in [dbt Slack](https://community.getdbt.com/) and the channel for your adapter's data store. Refer to the **Slack Channel** link in the [dbt Core platform](/docs/core/connect-data-platform/profiles.yml) pages. +- Check out the open issues in the plugin's source repository. Use the relevant **GitHub repo** link in the [dbt Core platform](/docs/core/connect-data-platform/profiles.yml) pages.
+ +### Create a new adapter + +If you see something missing from the lists above, and you're interested in developing an integration, read more about adapters and how they're developed in the [Adapter Development](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) section. + +If you have a new adapter, please add it to this list using a pull request! See [Documenting your adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter) for more information. diff --git a/website/docs/docs/core/about-core-setup.md b/website/docs/docs/core/about-core-setup.md new file mode 100644 index 00000000000..a4d5ff09ee3 --- /dev/null +++ b/website/docs/docs/core/about-core-setup.md @@ -0,0 +1,19 @@ +--- +title: About dbt Core setup +id: about-core-setup +description: "Configuration settings for dbt Core." +sidebar_label: "About dbt Core setup" +pagination_next: "docs/core/about-dbt-core" +pagination_prev: null +--- + +dbt Core is an [open-source](https://github.com/dbt-labs/dbt-core) tool that enables data teams to transform data using analytics engineering best practices. You can install dbt locally in your environment and use dbt Core on the command line. It can communicate with databases through adapters. + + This section of our docs will guide you through various settings to get started: + +- [About dbt Core](/docs/core/about-dbt-core) +- [Installing dbt](/docs/core/installation) +- [Connecting to a data platform](/docs/core/connect-data-platform/profiles.yml) +- [How to run your dbt projects](/docs/running-a-dbt-project/run-your-dbt-projects) + +If you need a more detailed first-time setup guide for specific data platforms, read our [quickstart guides](https://docs.getdbt.com/quickstarts). 
diff --git a/website/docs/docs/core/about-dbt-core.md b/website/docs/docs/core/about-dbt-core.md new file mode 100644 index 00000000000..a35d92420f3 --- /dev/null +++ b/website/docs/docs/core/about-dbt-core.md @@ -0,0 +1,25 @@ +--- +title: "About dbt Core" +id: "about-dbt-core" +sidebar_label: "About dbt Core" +--- + +[dbt Core](https://github.com/dbt-labs/dbt-core) is an open-source project where you can develop from the command line and run your dbt project. + +To use dbt Core, your workflow generally looks like: + +1. **Build your dbt project in a code editor —** popular choices include VSCode and Atom. + +2. **Run your project from the command line —** macOS ships with a default Terminal program, however you can also use iTerm or the command line prompt within a code editor to execute dbt commands. + +:::info How we set up our computers for working on dbt projects + +We've written a [guide](https://discourse.getdbt.com/t/how-we-set-up-our-computers-for-working-on-dbt-projects/243) for our recommended setup when running dbt projects using dbt Core. + +::: + +If you're using the command line, we recommend learning some basics of your terminal to help you work more effectively. In particular, it's important to understand `cd`, `ls` and `pwd` to be able to navigate through the directory structure of your computer easily. + +You can find more information on installing and setting up dbt Core [here](/docs/core/installation). + +**Note** — dbt supports a dbt Cloud CLI and dbt Core, both command line interface tools that enable you to run dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features).
diff --git a/website/docs/docs/core/connect-data-platform/about-core-connections.md b/website/docs/docs/core/connect-data-platform/about-core-connections.md new file mode 100644 index 00000000000..a85a32cc031 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/about-core-connections.md @@ -0,0 +1,31 @@ +--- +title: "About dbt Core data platform connections" +id: "about-core-connections" +description: "Information about data platform connections in dbt Core" +sidebar_label: "About data platform connections in dbt Core" +hide_table_of_contents: true +pagination_next: "docs/core/connect-data-platform/profiles.yml" +pagination_prev: null +--- + +dbt Core can connect with a variety of data platform providers including: + +- [Amazon Redshift](/docs/core/connect-data-platform/redshift-setup) +- [Apache Spark](/docs/core/connect-data-platform/spark-setup) +- [Databricks](/docs/core/connect-data-platform/databricks-setup) +- [Google BigQuery](/docs/core/connect-data-platform/bigquery-setup) +- [PostgreSQL](/docs/core/connect-data-platform/postgres-setup) +- [Snowflake](/docs/core/connect-data-platform/snowflake-setup) +- [Starburst or Trino](/docs/core/connect-data-platform/trino-setup) + +dbt communicates with a number of different data platforms by using a dedicated adapter for each. When you install dbt Core, you'll also need to install the specific adapter for your data platform, connect to dbt Core, and set up a [profiles.yml file](/docs/core/connect-data-platform/profiles.yml). You can do this using the command line (CLI). + +Data platforms supported in dbt Core may be verified or unverified, and maintained by dbt Labs, partners, or community members. + +These connection instructions provide the basic fields required for configuring a data platform connection in dbt Cloud. 
For more detailed guides, which include demo project data, read our [Quickstart guides](https://docs.getdbt.com/docs/quickstarts/overview) + +## Connection profiles + +If you're using dbt from the command line (CLI), you'll need a profiles.yml file that contains the connection details for your data platform. When you run dbt from the CLI, it reads your dbt_project.yml file to find the profile name, and then looks for a profile with the same name in your profiles.yml file. This profile contains all the information dbt needs to connect to your data platform. + +For detailed info, you can refer to the [Connection profiles](/docs/core/connect-data-platform/connection-profiles). diff --git a/website/docs/reference/warehouse-setups/alloydb-setup.md b/website/docs/docs/core/connect-data-platform/alloydb-setup.md similarity index 96% rename from website/docs/reference/warehouse-setups/alloydb-setup.md rename to website/docs/docs/core/connect-data-platform/alloydb-setup.md index b65b0759eed..c3f3ee9cfca 100644 --- a/website/docs/reference/warehouse-setups/alloydb-setup.md +++ b/website/docs/docs/core/connect-data-platform/alloydb-setup.md @@ -11,7 +11,7 @@ meta: slack_channel_name: '#db-postgres' slack_channel_link: 'https://getdbt.slack.com/archives/C0172G2E273' platform_name: 'AlloyDB' - config_page: 'postgres-configs' + config_page: '/reference/resource-configs/postgres-configs' --- ## Overview of AlloyDB support diff --git a/website/docs/reference/warehouse-setups/athena-setup.md b/website/docs/docs/core/connect-data-platform/athena-setup.md similarity index 90% rename from website/docs/reference/warehouse-setups/athena-setup.md rename to website/docs/docs/core/connect-data-platform/athena-setup.md index e95b89d69f4..db218110dc1 100644 --- a/website/docs/reference/warehouse-setups/athena-setup.md +++ b/website/docs/docs/core/connect-data-platform/athena-setup.md @@ -1,17 +1,18 @@ --- title: "Athena setup" +description: "Read this guide to learn about the Athena 
warehouse setup in dbt." meta: maintained_by: Community - authors: 'Tomme' - github_repo: 'Tomme/dbt-athena' - pypi_package: 'dbt-athena-adapter' - min_core_version: 'v1.0.1' + authors: Community + github_repo: 'dbt-athena/dbt-athena' + pypi_package: 'dbt-athena-community' + min_core_version: 'v1.3.0' cloud_support: Not Supported - min_supported_version: 'engine version 2' + min_supported_version: 'engine version 2 and 3' slack_channel_name: '#db-athena' slack_channel_link: 'https://getdbt.slack.com/archives/C013MLFR7BQ' platform_name: 'Athena' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' ---

Overview of {frontMatter.meta.pypi_package}

diff --git a/website/docs/reference/warehouse-setups/azuresynapse-setup.md b/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md similarity index 79% rename from website/docs/reference/warehouse-setups/azuresynapse-setup.md rename to website/docs/docs/core/connect-data-platform/azuresynapse-setup.md index 72630c844da..073e95530c1 100644 --- a/website/docs/reference/warehouse-setups/azuresynapse-setup.md +++ b/website/docs/docs/core/connect-data-platform/azuresynapse-setup.md @@ -1,9 +1,10 @@ --- title: "Microsoft Azure Synapse DWH setup" +description: "Read this guide to learn about the Microsoft Azure Synapse warehouse setup in dbt." meta: - maintained_by: Community - authors: 'dbt-msft community (https://github.com/dbt-msft)' - github_repo: 'dbt-msft/dbt-synapse' + maintained_by: Microsoft + authors: 'Microsoft (https://github.com/Microsoft)' + github_repo: 'Microsoft/dbt-synapse' pypi_package: 'dbt-synapse' min_core_version: 'v0.18.0' cloud_support: Not Supported @@ -11,12 +12,15 @@ meta: slack_channel_name: '#db-synapse' slack_channel_link: 'https://getdbt.slack.com/archives/C01DRQ178LQ' platform_name: 'Synapse' - config_page: 'no-configs' + config_page: '/reference/resource-configs/azuresynapse-configs' --- -:::info Community plugin +:::info + +The following is a guide to using Azure Synapse Analytics Dedicated SQL Pools, formerly SQL DW. For more info, refer to [What is dedicated SQL pool (formerly SQL DW) in Azure Synapse Analytics?](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-overview-what-is). + +Refer to [Microsoft Fabric Synapse Data Warehouse](/docs/core/connect-data-platform/fabric-setup) to set it up with dbt. + -Some core functionality may be limited. If you're interested in contributing, check out the source code for each repository listed below.
::: @@ -50,7 +54,7 @@ pip is the easiest way to install the adapter: :::info Dedicated SQL only Azure Synapse offers both Dedicated SQL Pools and Serverless SQL Pools. -**Only Dedicated SQL Pools are supported by this adapter. If you really insist on using serverless pools, check out the neglected, experimental project: [dbt-synapse-serverless](https://github.com/dbt-msft/dbt-synapse-serverless)** +**Only Dedicated SQL Pools are supported by this adapter.** ::: @@ -66,7 +70,7 @@ Download and install the [Microsoft ODBC Driver 18 for SQL Server](https://docs. If you already have ODBC Driver 17 installed, then that one will work as well. :::tip Default settings change in dbt-synapse v1.2 / ODBC Driver 18 -Microsoft made several changes related to connection encryption. Read more about the changes [here](/reference/warehouse-setups/mssql-setup). +Microsoft made several changes related to connection encryption. Read more about the changes [here](/docs/core/connect-data-platform/mssql-setup). ::: ### Authentication methods @@ -98,4 +102,4 @@ your_profile_name: -You can find all the available options and the documentation and how to configure them on [the documentation page for the dbt-sqlserver adapter](/reference/warehouse-setups/mssql-setup). +You can find all the available options and the documentation and how to configure them on [the documentation page for the dbt-sqlserver adapter](/docs/core/connect-data-platform/mssql-setup).
diff --git a/website/docs/reference/warehouse-setups/bigquery-setup.md b/website/docs/docs/core/connect-data-platform/bigquery-setup.md similarity index 87% rename from website/docs/reference/warehouse-setups/bigquery-setup.md rename to website/docs/docs/core/connect-data-platform/bigquery-setup.md index 82c92872758..7a2a445be3f 100644 --- a/website/docs/reference/warehouse-setups/bigquery-setup.md +++ b/website/docs/docs/core/connect-data-platform/bigquery-setup.md @@ -1,5 +1,6 @@ --- title: "BigQuery setup" +description: "Read this guide to learn about the BigQuery warehouse setup in dbt." meta: maintained_by: dbt Labs authors: 'core dbt maintainers' @@ -10,10 +11,13 @@ meta: min_supported_version: 'n/a' slack_channel_name: '#db-bigquery' slack_channel_link: 'https://getdbt.slack.com/archives/C99SNSRTK' - platform_name: 'Big Query' - config_page: 'bigquery-configs' + platform_name: 'BigQuery' + config_page: '/reference/resource-configs/bigquery-configs' --- + + +

Overview of {frontMatter.meta.pypi_package}

    @@ -46,12 +50,12 @@ pip is the easiest way to install the adapter: BigQuery targets can be specified using one of four methods: -1. [oauth via `gcloud`](#oauth-via-gcloud) -2. [oauth token-based](#oauth-token-based) +1. [OAuth via `gcloud`](#oauth-via-gcloud) +2. [OAuth token-based](#oauth-token-based) 3. [service account file](#service-account-file) 4. [service account json](#service-account-json) -For local development, we recommend using the oauth method. If you're scheduling dbt on a server, you should use the service account auth method instead. +For local development, we recommend using the OAuth method. If you're scheduling dbt on a server, you should use the service account auth method instead. BigQuery targets should be set up using the following configuration in your `profiles.yml` file. There are a number of [optional configurations](#optional-configurations) you may specify as well. @@ -80,13 +84,11 @@ my-bigquery-db: **Default project** -New in dbt v0.19.0 - If you do not specify a `project`/`database` and are using the `oauth` method, dbt will use the default `project` associated with your user, as defined by `gcloud config set`. -### Oauth Token-Based +### OAuth Token-Based -See [docs](https://developers.google.com/identity/protocols/oauth2) on using Oauth 2.0 to access Google APIs. +See [docs](https://developers.google.com/identity/protocols/oauth2) on using OAuth 2.0 to access Google APIs. - The `dbt-bigquery` plugin uses the BigQuery Python client library to submit queries. Each query requires two steps: 1. Job creation: Submit the query job to BigQuery, and receive its job ID. 2. Job execution: Wait for the query job to finish executing, and receive its result. @@ -247,11 +247,17 @@ In older versions of `dbt-bigquery`, this same config was called `timeout_second ::: -No timeout is set by default. (For historical reasons, some query types use a default of 300 seconds when the `job_execution_timeout_seconds` configuration is not set.) 
When `job_execution_timeout_seconds` is set, if any dbt query, including a model's SQL transformation, takes longer than 300 seconds to complete, BigQuery might cancel the query and issue the following error: +No timeout is set by default. (For historical reasons, some query types use a default of 300 seconds when the `job_execution_timeout_seconds` configuration is not set). When you do set the `job_execution_timeout_seconds`, if any dbt query takes more than 300 seconds to finish, the dbt-bigquery adapter will run into an exception: ``` Operation did not complete within the designated timeout. ``` + +:::caution Note + +The `job_execution_timeout_seconds` represents the number of seconds to wait for the [underlying HTTP transport](https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.QueryJob#google_cloud_bigquery_job_QueryJob_result). It _doesn't_ represent the maximum allowable time for a BigQuery job itself. So, if dbt-bigquery ran into an exception at 300 seconds, the actual BigQuery job could still be running for the time set in BigQuery's own timeout settings. + +::: You can change the timeout seconds for the job execution step by configuring `job_execution_timeout_seconds` in the BigQuery profile: @@ -311,57 +317,6 @@ my-profile: - - - - -BigQuery supports query timeouts. By default, the timeout is set to 300 seconds. If a dbt model takes longer than this timeout to complete, then BigQuery may cancel the query and issue the following error: - -``` - Operation did not complete within the designated timeout. -``` - -To change this timeout, use the `timeout_seconds` configuration: - - - -```yaml -my-profile: - target: dev - outputs: - dev: - type: bigquery - method: oauth - project: abc-123 - dataset: my_dataset - timeout_seconds: 600 # 10 minutes -``` - - - -The `retries` profile configuration designates the number of times dbt should retry queries that result in unhandled server errors. 
This configuration is only specified for BigQuery targets. Example: - - - -```yaml -# This example target will retry BigQuery queries 5 -# times with a delay. If the query does not succeed -# after the fifth attempt, then dbt will raise an error - -my-profile: - target: dev - outputs: - dev: - type: bigquery - method: oauth - project: abc-123 - dataset: my_dataset - retries: 5 -``` - - - - ### Dataset locations @@ -383,12 +338,6 @@ my-profile: ### Maximum Bytes Billed - - -- New in dbt v0.17.0 - - - When a `maximum_bytes_billed` value is configured for a BigQuery profile, queries executed by dbt will fail if they exceed the configured maximum bytes threshhold. This configuration should be supplied as an integer number @@ -435,9 +384,8 @@ my-profile: ``` ### Service Account Impersonation -New in v0.18.0 -This feature allows users authenticating via local oauth to access BigQuery resources based on the permissions of a service account. +This feature allows users authenticating via local OAuth to access BigQuery resources based on the permissions of a service account. ```yaml my-profile: @@ -453,11 +401,10 @@ my-profile: For a general overview of this process, see the official docs for [Creating Short-lived Service Account Credentials](https://cloud.google.com/iam/docs/creating-short-lived-service-account-credentials). - - + + ### Execution project -New in v0.21.0 By default, dbt will use the specified `project`/`database` as both: 1. 
The location to materialize resources (models, seeds, snapshots, etc), unless they specify a custom `project`/`database` config @@ -497,12 +444,44 @@ my-profile: project: abc-123 dataset: my_dataset - # for dbt Python models + # for dbt Python models to be run on a Dataproc cluster gcs_bucket: dbt-python dataproc_cluster_name: dbt-python dataproc_region: us-central1 ``` +Alternatively, Dataproc Serverless can be used: + +```yaml +my-profile: + target: dev + outputs: + dev: + type: bigquery + method: oauth + project: abc-123 + dataset: my_dataset + + # for dbt Python models to be run on Dataproc Serverless + gcs_bucket: dbt-python + dataproc_region: us-central1 + submission_method: serverless + dataproc_batch: + environment_config: + execution_config: + service_account: dbt@abc-123.iam.gserviceaccount.com + subnetwork_uri: regions/us-central1/subnetworks/dataproc-dbt + labels: + project: my-project + role: dev + runtime_config: + properties: + spark.executor.instances: 3 + spark.driver.memory: 1g +``` + +For a full list of possible configuration fields that can be passed in `dataproc_batch`, refer to the [Dataproc Serverless Batch](https://cloud.google.com/dataproc-serverless/docs/reference/rpc/google.cloud.dataproc.v1#google.cloud.dataproc.v1.Batch) documentation. + ## Required permissions @@ -527,6 +506,6 @@ https://www.googleapis.com/auth/drive.readonly,\ https://www.googleapis.com/auth/iam.test ``` -A browser window should open, and you should be prompted to log into your Google account. Once you've done that, dbt will use your oauth'd credentials to connect to BigQuery! +A browser window should open, and you should be prompted to log into your Google account. Once you've done that, dbt will use your OAuth'd credentials to connect to BigQuery! This command uses the `--scopes` flag to request access to Google Sheets. This makes it possible to transform data in Google Sheets using dbt. 
If your dbt project does not transform data in Google Sheets, then you may omit the `--scopes` flag. diff --git a/website/docs/reference/warehouse-setups/clickhouse-setup.md b/website/docs/docs/core/connect-data-platform/clickhouse-setup.md similarity index 87% rename from website/docs/reference/warehouse-setups/clickhouse-setup.md rename to website/docs/docs/core/connect-data-platform/clickhouse-setup.md index 5c66eab6df3..fb0965398a2 100644 --- a/website/docs/reference/warehouse-setups/clickhouse-setup.md +++ b/website/docs/docs/core/connect-data-platform/clickhouse-setup.md @@ -1,5 +1,6 @@ --- title: "ClickHouse setup" +description: "Read this guide to learn about the ClickHouse warehouse setup in dbt." meta: maintained_by: Community authors: 'Geoff Genz' @@ -11,7 +12,7 @@ meta: slack_channel_name: '#db-clickhouse' slack_channel_link: 'https://getdbt.slack.com/archives/C01DRQ178LQ' platform_name: 'Clickhouse' - config_page: 'clickhouse-configs' + config_page: '/reference/resource-configs/clickhouse-configs' --- Some core functionality may be limited. If you're interested in contributing, check out the source code for each repository listed below. @@ -47,7 +48,7 @@ pip is the easiest way to install the adapter: ## Connecting to ClickHouse with **dbt-clickhouse** -To connect to ClickHouse from dbt, you'll need to add a [profile](https://docs.getdbt.com/dbt-cli/configure-your-profile) to your `profiles.yml` file. A ClickHouse profile conforms to the following syntax: +To connect to ClickHouse from dbt, you'll need to add a [profile](https://docs.getdbt.com/docs/core/connection-profiles) to your `profiles.yml` file. 
A ClickHouse profile conforms to the following syntax: @@ -74,6 +75,7 @@ To connect to ClickHouse from dbt, you'll need to add a [profile](https://docs.g compress_block_size: 1048576 database_engine: check_exchange: True + use_lw_deletes: False custom_settings: ``` @@ -83,31 +85,32 @@ To connect to ClickHouse from dbt, you'll need to add a [profile](https://docs.g #### Description of ClickHouse Profile Fields -| Field | Description | -|------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `type` | This must be included either in `profiles.yml` or in the `dbt_project.yml` file. Must be set to `clickhouse`. | -| `schema` | Required. A ClickHouse's database name. The dbt model database.schema.table is not compatible with ClickHouse because ClickHouse does not support a schema. So we use a simple model schema.table, where schema is the ClickHouse's database. We don't recommend using the `default` database. | -| `user` | Required. A ClickHouse username with adequate permissions to access the specified `schema`. | -| `password` | Required. The password associated with the specified `user`. | -| `driver` | Optional. The ClickHouse client interface, `http` or `native`. Defaults to `http` unless the `port` is set to 9440 or 9400, in which case the `native` driver is assumed. | -| `port` | Optional. ClickHouse server port number. Defaults to 8123/8443 (secure) if the driver is `http`, and to 9000/9440(secure) if the driver is `native`. | -| `host` | Optional. The host name of the connection. Default is `localhost`. | -| `retries` | Optional. 
Number of times to retry the initial connection attempt if the error appears to be recoverable. | -| `verify` | Optional. For (`secure=True`) connections, validate the ClickHouse server TLS certificate, including matching hostname, expiration, and signed by a trusted Certificate Authority. Defaults to True. | -| `secure` | Optional. Whether the connection (either http or native) is secured by TLS. This converts an http driver connection to https, and a native driver connection to the native ClickHouse protocol over TLS. the Defaults to False. | -| `cluster_mode` | Optional. Add connection settings to improve compatibility with clusters using the Replicated Database Engine. Default False. | -| `connect_timeout` | Optional. Connection timeout in seconds. Defaults is 10 seconds. | -| `send_receive_timeout` | Optional. Timeout for receiving data from or sending data to ClickHouse. Defaults to 5 minutes (300 seconds) | -| `sync_request_timeout` | Optional. Timeout for connection ping request (native connection only). Defaults to 5 seconds. | -| `compression` | Optional. Use compression in the connection. Defaults to `False`. If set to `True` for HTTP, this enables gzip compression. If set to `True` for the native protocol, this enabled lz4 compression. Other valid values are `lz4hc` and `zstd` for the native driver only. | -| `compress_block_size` | Optional. Compression block size (in bytes) when using compression with the native driver. Defaults to 1MB | -| `database_engine` | Optional. Database engine to use when creating new ClickHouse schemas (databases). If not set (the default), new databases will use the default ClickHouse database engine (usually Atomic). 
| +| Field | Description | +|------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `type` | This must be included either in `profiles.yml` or in the `dbt_project.yml` file. Must be set to `clickhouse`. | +| `schema` | Required. A ClickHouse's database name. The dbt model database.schema.table is not compatible with ClickHouse because ClickHouse does not support a schema. So we use a simple model schema.table, where schema is the ClickHouse's database. We don't recommend using the `default` database. | +| `user` | Required. A ClickHouse username with adequate permissions to access the specified `schema`. | +| `password` | Required. The password associated with the specified `user`. | +| `driver` | Optional. The ClickHouse client interface, `http` or `native`. Defaults to `http` unless the `port` is set to 9440 or 9400, in which case the `native` driver is assumed. | +| `port` | Optional. ClickHouse server port number. Defaults to 8123/8443 (secure) if the driver is `http`, and to 9000/9440(secure) if the driver is `native`. | +| `host` | Optional. The host name of the connection. Default is `localhost`. | +| `retries` | Optional. Number of times to retry the initial connection attempt if the error appears to be recoverable. | +| `verify` | Optional. For (`secure=True`) connections, validate the ClickHouse server TLS certificate, including matching hostname, expiration, and signed by a trusted Certificate Authority. Defaults to True. | +| `secure` | Optional. Whether the connection (either http or native) is secured by TLS. 
This converts an http driver connection to https, and a native driver connection to the native ClickHouse protocol over TLS. the Defaults to False. | +| `cluster_mode` | Optional. Add connection settings to improve compatibility with clusters using the Replicated Database Engine. Default False. | +| `connect_timeout` | Optional. Connection timeout in seconds. Defaults is 10 seconds. | +| `send_receive_timeout` | Optional. Timeout for receiving data from or sending data to ClickHouse. Defaults to 5 minutes (300 seconds) | +| `sync_request_timeout` | Optional. Timeout for connection ping request (native connection only). Defaults to 5 seconds. | +| `compression` | Optional. Use compression in the connection. Defaults to `False`. If set to `True` for HTTP, this enables gzip compression. If set to `True` for the native protocol, this enabled lz4 compression. Other valid values are `lz4hc` and `zstd` for the native driver only. | +| `compress_block_size` | Optional. Compression block size (in bytes) when using compression with the native driver. Defaults to 1MB | +| `database_engine` | Optional. Database engine to use when creating new ClickHouse schemas (databases). If not set (the default), new databases will use the default ClickHouse database engine (usually Atomic). | | `check_exchange` | Optional. On connecting to the ClickHouse, if this is parameter is `True` DBT will validate that the ClickHouse server supports atomic exchange of tables. Using atomic exchange (when available) improves reliability and parallelism. This check is unnecessary for ClickHouse running on recent Linux operating system, and in those circumstances can be disabled by setting `check_exchange` to `False` to avoid additional overhead on startup. Defaults to `True`. | -| `custom_settings` | Optional. A mapping of ClickHouse specific user settings to use with the connection. See the ClickHouse documentation for supported settings. | +| `use_lw_deletes` | Optional. 
If ClickHouse experimental lightweight deletes are available, use the `delete+insert` strategy as the default strategy for incremental materializations. Defaults to `False` (use legacy strategy). | +| `custom_settings` | Optional. A mapping of ClickHouse specific user settings to use with the connection. See the ClickHouse documentation for supported settings. | #### Troubleshooting Connections If you encounter issues connecting to ClickHouse from dbt, make sure the following criteria are met: -- The engine must be one of the [supported engines](clickhouse-configs#supported-table-engines). +- The engine must be one of the [supported engines](/reference/resource-configs/clickhouse-configs#supported-table-engines). - You must have adequate permissions to access the database. -- If you're not using the default table engine for the database, you must specify a table engine in your model configuration. \ No newline at end of file +- If you're not using the default table engine for the database, you must specify a table engine in your model configuration. diff --git a/website/docs/docs/get-started/connection-profiles.md b/website/docs/docs/core/connect-data-platform/connection-profiles.md similarity index 78% rename from website/docs/docs/get-started/connection-profiles.md rename to website/docs/docs/core/connect-data-platform/connection-profiles.md index b1614a0deeb..8088ff1dfa7 100644 --- a/website/docs/docs/get-started/connection-profiles.md +++ b/website/docs/docs/core/connect-data-platform/connection-profiles.md @@ -4,12 +4,6 @@ id: "connection-profiles" description: "Configure your profile using the command line." --- -## Related documentation - -* [`profiles.yml` reference](/reference/profiles.yml): Learn more about profile configuration. - -## Connecting to your warehouse using the command line - When you invoke dbt from the command line, dbt parses your `dbt_project.yml` and obtains the `profile` name, which dbt needs to connect to your . 
@@ -23,9 +17,9 @@ profile: 'jaffle_shop' -dbt then checks your `profiles.yml` file for a profile with the same name. A profile contains all the details required to connect to your data warehouse. +dbt then checks your [`profiles.yml` file](/docs/core/connect-data-platform/profiles.yml) for a profile with the same name. A profile contains all the details required to connect to your data warehouse. - + By default, dbt expects the `profiles.yml` file to be located in the `~/.dbt/` directory. @@ -63,7 +57,7 @@ jaffle_shop: In your `profiles.yml` file, you can store as many profiles as you need. Typically, you would have one profile for each warehouse you use. Most organizations only have one profile. -For information about configuring advanced options, see [the `profiles.yml` reference page](reference/profiles.yml.md). +For information about configuring advanced options, see [the `profiles.yml` reference page](/docs/core/connect-data-platform/profiles.yml). ## About profiles @@ -71,7 +65,7 @@ A profile consists of _targets_, and a specified _default target_. Each _target_ specifies the type of warehouse you are connecting to, the credentials to connect to the warehouse, and some dbt-specific configurations. -The credentials you need to provide in your target varies across warehouses &mdash sample profiles for each supported warehouse are available in the [Supported Data Platforms](supported-data-platforms) section. +The credentials you need to provide in your target varies across warehouses — sample profiles for each supported warehouse are available in the [Supported Data Platforms](/docs/supported-data-platforms) section. **Pro Tip:** You may need to surround your password in quotes if it contains special characters. More details [here](https://stackoverflow.com/a/37015689/10415173). @@ -91,13 +85,13 @@ You can find more information on which values to use in your targets below. 
:::info Validating your warehouse credentials -Use the [debug](debug) command to check whether you can successfully connect to your warehouse. Simply run `dbt debug` from within a dbt project to test your connection. +Use the [debug](/reference/dbt-jinja-functions/debug-method) command to check whether you can successfully connect to your warehouse. Simply run `dbt debug` from within a dbt project to test your connection. ::: ## Understanding targets in profiles -dbt supports multiple targets within one profile to encourage the use of separate development and production environments as discussed in [Managing Environments](/docs/collaborate/environments). +dbt supports multiple targets within one profile to encourage the use of separate development and production environments as discussed in [dbt Core Environments](/docs/core/dbt-core-environments). A typical profile for an analyst using dbt locally will have a target named `dev`, and have this set as the default. @@ -141,23 +135,15 @@ While the target schema represents the default schema that dbt will use, it may ## Understanding threads -When dbt runs, it creates a directed acyclic graph (DAG) of links between models. The number of threads represents the maximum number of paths through the graph dbt may work on at once – increasing the number of threads can minimize the run time of your project. - -For example, if you specify `threads: 1`, dbt will start building only one model, and finish it, before moving onto the next. Specifying `threads: 8` means that dbt will work on _up to_ 8 models at once without violating dependencies – the actual number of models it can work on will likely be constrained by the available paths through the dependency graph. +When dbt runs, it creates a directed acyclic graph (DAG) of links between models. The number of threads represents the maximum number of paths through the graph dbt may work on at once – increasing the number of threads can minimize the run time of your project. 
The default value for threads in user profiles is [4 threads](/docs/dbt-versions/release-notes/Dec-2022/default-thread-value). -There's no set limit of the maximum number of threads you can set – while increasing the number of threads generally decreases execution time, there are a number of things to consider: -* Increasing the number of threads increases the load on your warehouse, which may impact other tools in your data stack. For example, if your BI tool uses the same compute resources as dbt, their queries may get queued during a dbt run. -* The number of concurrent queries your database will allow you to run may be a limiting factor in how many models can be actively built – some models may queue while waiting for an available query slot. - -Generally the optimal number of threads depends on your data warehouse and its configuration. It’s best to test different values to find the best number of threads for your project. We recommend setting this to 4 to start with. - -You can use a different number of threads than the value defined in your target by using the `--threads` option when executing a dbt command. +For more information, check out [using threads](/docs/running-a-dbt-project/using-threads). ## Advanced: Customizing a profile directory The parent directory for `profiles.yml` is determined using the following precedence: - + 1. `--profiles-dir` option 1. `DBT_PROFILES_DIR` environment variable @@ -205,4 +191,8 @@ $ export DBT_PROFILES_DIR=path/to/directory ## Advanced: Using environment variables -Credentials can be placed directly into the `profiles.yml` file or loaded from environment variables. Using environment variables is especially useful for production deployments of dbt. You can find more information about environment variables [here](env_var). +Credentials can be placed directly into the `profiles.yml` file or loaded from environment variables. Using environment variables is especially useful for production deployments of dbt. 
You can find more information about environment variables [here](/reference/dbt-jinja-functions/env_var). + +## Related docs + +* [About `profiles.yml`](/docs/core/connect-data-platform/profiles.yml) to learn more about profile configuration. diff --git a/website/docs/docs/core/connect-data-platform/databend-setup.md b/website/docs/docs/core/connect-data-platform/databend-setup.md new file mode 100644 index 00000000000..daccd14f6c3 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/databend-setup.md @@ -0,0 +1,123 @@ +--- +title: "Databend Cloud setup" +description: "Read this guide to learn about the Databend warehouse setup in dbt." +id: "databend-setup" +meta: + maintained_by: Databend Cloud + authors: Shanjie Han + github_repo: 'databendcloud/dbt-databend' + pypi_package: 'dbt-databend-cloud' + min_core_version: 'v1.0.0' + core_version: 'v1.0.0 and newer' + cloud_support: Not Supported + min_supported_version: 'n/a' + platform_name: 'Databend Cloud' + config_page: '/reference/resource-configs/no-configs' +--- + +:::info Vendor-supported plugin + +Some [core functionality](https://github.com/databendcloud/dbt-databend#supported-features) may be limited. +If you're interested in contributing, check out the source code repository listed below. + +::: + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
    • Maintained by: {frontMatter.meta.maintained_by}
    • +
    • Authors: {frontMatter.meta.authors}
    • +
    • GitHub repo: {frontMatter.meta.github_repo}
    • +
    • PyPI package: {frontMatter.meta.pypi_package}
    • +
    • Slack channel: {frontMatter.meta.slack_channel_name}
    • +
    • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
    • +
    • dbt Cloud support: {frontMatter.meta.cloud_support}
    • +
    • Minimum data platform version: {frontMatter.meta.min_supported_version}
    • +
    + + +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

    For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + + +## Connecting to Databend Cloud with **dbt-databend-cloud** + +### User / Password Authentication + +Configure your dbt profile for using Databend Cloud: + +#### Databend Cloud connection profile + + +```yaml +dbt-databend-cloud: + target: dev + outputs: + dev: + type: databend + host: databend-cloud-host + port: 443 + schema: database_name + user: username + pass: password +``` + + + +#### Description of Profile Fields + +| Option | Description | Required? | Example | +|----------|------------------------------------------------------|-----------|---------------------| +| type | The specific adapter to use | Required | `databend` | +| host | The host (hostname) to connect to | Required | `yourorg.datafusecloud.com` | +| port | The port to use | Required | `443` | +| schema | Specify the schema (database) to build models into | Required | `default` | +| user | The username to use to connect to the host | Required | `dbt_admin` | +| pass | The password to use for authenticating to the host | Required | `awesome_password` | + +## Database User Privileges + +Your database user would be able to have some abilities to read or write, such as `SELECT`, `CREATE`, and so on. +You can find some help [here](https://docs.databend.com/using-databend-cloud/warehouses/connecting-a-warehouse) with Databend Cloud privileges management. + +| Required Privilege | +|------------------------| +| SELECT | +| CREATE | +| CREATE TEMPORARY TABLE | +| CREATE VIEW | +| INSERT | +| DROP | +| SHOW DATABASE | +| SHOW VIEW | +| SUPER | + +## Supported features + + | ok | Feature | +|:--:|:---------------------------:| +| ✅ | Table materialization | +| ✅ | View materialization | +| ✅ | Incremental materialization | +| ❌ | Ephemeral materialization | +| ✅ | Seeds | +| ✅ | Sources | +| ✅ | Custom data tests | +| ✅ | Docs generate | +| ❌ | Snapshots | +| ✅ | Connection retry | + +**Note:** + +* Databend does not support `Ephemeral` and `SnapShot`. 
You can find more detail [here](https://github.com/datafuselabs/databend/issues/8685) diff --git a/website/docs/docs/core/connect-data-platform/databricks-setup.md b/website/docs/docs/core/connect-data-platform/databricks-setup.md new file mode 100644 index 00000000000..caf52d09de3 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/databricks-setup.md @@ -0,0 +1,165 @@ +--- +title: "Databricks setup" +description: "Read this guide to learn about the Databricks warehouse setup in dbt." +id: "databricks-setup" +meta: + maintained_by: Databricks + authors: 'some dbt loving Bricksters' + github_repo: 'databricks/dbt-databricks' + pypi_package: 'dbt-databricks' + min_core_version: 'v0.18.0' + cloud_support: Supported + min_supported_version: 'Databricks SQL or DBR 12+' + slack_channel_name: '#db-databricks-and-spark' + slack_channel_link: 'https://getdbt.slack.com/archives/CNGCW8HKL' + platform_name: 'Databricks' + config_page: '/reference/resource-configs/databricks-configs' +--- + + + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
    • Maintained by: {frontMatter.meta.maintained_by}
    • +
    • Authors: {frontMatter.meta.authors}
    • +
    • GitHub repo: {frontMatter.meta.github_repo}
    • +
    • PyPI package: {frontMatter.meta.pypi_package}
    • +
    • Slack channel: {frontMatter.meta.slack_channel_name}
    • +
    • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
    • +
    • dbt Cloud support: {frontMatter.meta.cloud_support}
    • +
    • Minimum data platform version: {frontMatter.meta.min_supported_version}
    • +
    + + +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

    For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + +`dbt-databricks` is the recommended adapter for Databricks. It includes features not available in `dbt-spark`, such as: +- Unity Catalog support +- No need to install additional drivers or dependencies for use on the CLI +- Use of Delta Lake for all models out of the box +- SQL macros that are optimized to run with [Photon](https://docs.databricks.com/runtime/photon.html) + +## Connecting to Databricks + +To connect to a data platform with dbt Core, create the appropriate _profile_ and _target_ YAML keys/values in the `profiles.yml` configuration file for your Databricks SQL Warehouse/cluster. This dbt YAML file lives in the `.dbt/` directory of your user/home directory. For more info, refer to [Connection profiles](/docs/core/connect-data-platform/connection-profiles) and [profiles.yml](/docs/core/connect-data-platform/profiles.yml). + +`dbt-databricks` can connect to Databricks SQL Warehouses and all-purpose clusters. Databricks SQL Warehouses is the recommended way to get started with Databricks. + +Refer to the [Databricks docs](https://docs.databricks.com/dev-tools/dbt.html#) for more info on how to obtain the credentials for configuring your profile. + +### Examples {#examples} + +You can use either token-based authentication or OAuth client-based authentication to connect to Databricks. Refer to the following examples for more info on how to configure your profile for each type of authentication. 
+ + + + + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: databricks + catalog: [optional catalog name if you are using Unity Catalog] + schema: [schema name] # Required + host: [yourorg.databrickshost.com] # Required + http_path: [/sql/your/http/path] # Required + token: [dapiXXXXXXXXXXXXXXXXXXXXXXX] # Required Personal Access Token (PAT) if using token-based authentication + threads: [1 or more] # Optional, default 1 +``` + + + + + + + + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: databricks + catalog: [optional catalog name if you are using Unity Catalog] + schema: [schema name] # Required + host: [yourorg.databrickshost.com] # Required + http_path: [/sql/your/http/path] # Required + auth_type: oauth # Required if using OAuth-based authentication + client_id: [OAuth-Client-ID] # The ID of your OAuth application. Required if using OAuth-based authentication + client_secret: [XXXXXXXXXXXXXXXXXXXXXXXXXXX] # OAuth client secret. # Required if using OAuth-based authentication + threads: [1 or more] # Optional, default 1 +``` + + + + + + +## Host parameters + +The following profile fields are always required. + +| Field | Description | Example | +| --------- | ------- | ----------- | +| `host` | The hostname of your cluster.

    Don't include the `http://` or `https://` prefix. | `yourorg.databrickshost.com` | +| `http_path` | The http path to your SQL Warehouse or all-purpose cluster. | `/sql/your/http/path` | +| `schema` | The name of a schema within your cluster's catalog.

    It's _not recommended_ to use schema names that have upper case or mixed case letters. | `my_schema` | +## Authentication parameters + +The `dbt-databricks` adapter supports both [token-based authentication](/docs/core/connect-data-platform/databricks-setup?tokenoauth=token#examples) and [OAuth client-based authentication](/docs/core/connect-data-platform/databricks-setup?tokenoauth=oauth#examples). + +Refer to the following **required** parameters to configure your profile for each type of authentication: + +| Field | Authentication type | Description | Example | +| --------- | ------- | ----------- | ---- | +| `token` | Token-based | The Personal Access Token (PAT) to connect to Databricks. | `dapiXXXXXXXXX`
    `XXXXXXXXXXXXXX` | +| `client_id` | OAuth-based | The client ID for your Databricks OAuth application.
    | `` | +| `client_secret` | OAuth-based | The client secret for your Databricks OAuth application.
    | `XXXXXXXXXXXXX`
    `XXXXXXXXXXXXXX` | +| `auth_type` | OAuth-based | The type of authorization needed to connect to Databricks.
    | `oauth` | + +## Additional parameters + +The following profile fields are optional to set up. They help you configure how your cluster's session and dbt work for your connection. + +| Profile field | Description | Example | +| ------------- | ------------------- | --------------- | +| `threads` | The number of threads dbt should use (default is `1`) |`8` | +| `connect_retries` | The number of times dbt should retry the connection to Databricks (default is `1`) |`3` | +| `connect_timeout` | How many seconds before the connection to Databricks should timeout (default behavior is no timeouts) | `1000` | +| `session_properties` | This sets the Databricks session properties used in the connection. Execute `SET -v` to see available options |`ansi_mode: true` | + +## Supported Functionality + +### Delta Lake + +Most dbt Core functionality is supported, but some features are only available +on Delta Lake. + +Delta-only features: +1. Incremental model updates by `unique_key` instead of `partition_by` (see [`merge` strategy](/reference/resource-configs/databricks-configs#the-merge-strategy)) +2. [Snapshots](/docs/build/snapshots) + + +### Unity Catalog + +The adapter `dbt-databricks>=1.1.1` supports the 3-level namespace of Unity Catalog (catalog / schema / relations) so you can organize and secure your data the way you like. 
diff --git a/website/docs/docs/core/connect-data-platform/decodable-setup.md b/website/docs/docs/core/connect-data-platform/decodable-setup.md new file mode 100644 index 00000000000..b43521732d4 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/decodable-setup.md @@ -0,0 +1,156 @@ +--- +title: "Decodable setup" +id: "decodable-setup" +meta: + maintained_by: Decodable + authors: "Decodable Team" + github_repo: 'decodableco/dbt-decodable' + pypi_package: 'dbt-decodable' + min_core_version: '1.3.1' + core_version: '' + cloud_support: Not supported + min_supported_version: 'n/a' + slack_channel_name: '#general' + slack_channel_link: 'https://decodablecommunity.slack.com' + platform_name: 'Decodable' + config_page: '/reference/resource-configs/no-configs' +--- + +:::info Community plugin + +Some core functionality may be limited. If you're interested in contributing, see the source code for the repository listed below. +::: + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
    • Maintained by: {frontMatter.meta.maintained_by}
    • +
    • Authors: {frontMatter.meta.authors}
    • +
    • GitHub repo: {frontMatter.meta.github_repo}
    • +
    • PyPI package: {frontMatter.meta.pypi_package}
    • +
    • Slack channel: {frontMatter.meta.slack_channel_name}
    • +
    • Supported dbt Core version: {frontMatter.meta.min_core_version}
    • +
    • dbt Cloud support: {frontMatter.meta.cloud_support}
    • +
    • Minimum data platform version: {frontMatter.meta.min_supported_version}
    • +
    + + +

    Installing {frontMatter.meta.pypi_package}

    + +dbt-decodable is also available on PyPI. pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +
    +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

    For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration.

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + + +## Connecting to Decodable with **dbt-decodable** +Do the following steps to connect to Decodable with dbt. + +### Prerequisites +In order to properly connect to Decodable, you must have the Decodable CLI installed and have used it to login to Decodable at least once. See Install the Decodable CLI for more information. + +### Steps +To connect to Decodable with dbt, you'll need to add a Decodable profile to your `profiles.yml` file. A Decodable profile has the following fields. + + + +```yaml +dbt-decodable: + target: dev + outputs: + dev: + type: decodable + database: None + schema: None + account_name: [your account] + profile_name: [name of the profile] + materialize_tests: [true | false] + timeout: [ms] + preview_start: [earliest | latest] + local_namespace: [namespace prefix] + +``` + + + +#### Description of Profile Fields + +| Option | Description | Required? | Example | +|----------|------------------------------------------------------|-----------|---------------------| +| type | The specific adapter to use | Required | `decodable` | +| database | Required but unused by this adapter. | Required | | +| schema | Required but unused by this adapter. | Required | | +| account_name | The name of your Decodable account. | Required | `my_awesome_decodable_account` | +| profile_name | The name of your Decodable profile. | Required | `my_awesome_decodable_profile` | +| materialize_tests | Specify whether to materialize tests as a pipeline/stream pair. Defaults to false. | Optional | `false` | +| timeout | The amount of time, in milliseconds, that a preview request runs. Defaults to 60000. | Optional | `60000` | +| preview_start | Specify where preview should start reading data from. If set to `earliest`, then preview will start reading from the earliest record possible. If set to `latest`, preview will start reading from the latest record. Defaults to `earliest`. 
| Optional | `latest` | +| local_namespace | Specify a prefix to add to all entities created on Decodable. Defaults to `none`, meaning that no prefix is added. | Optional | `none` | + + +## Supported features + +| Name | Supported | Notes | +|---|---|---| +| Table materialization | Yes | Only table materializations are supported. A dbt table model translates to a pipeline/stream pair in Decodable, both sharing the same name. Pipelines for models are automatically activated upon materialization. To materialize your models, run the `dbt run` command which does the following:
    1. Create a stream with the model's name and schema inferred by Decodable from the model's SQL.
    2. Create a pipeline that inserts the SQL's results into the newly created stream.
    3. Activate the pipeline.
    By default, the adapter does not tear down and recreate the model on Decodable if no changes to the model have been detected. Invoking dbt with the `--full-refresh` flag or setting that configuration option for a specific model causes the corresponding resources on Decodable to be destroyed and built from scratch. | +| View materialization | No | | +| Incremental materialization | No | | +| Ephemeral materialization | No | | +| Seeds | Yes | Running the `dbt seed` command performs the following steps for each specified seed:
    1. Create a REST connection and an associated stream with the same name as the seed.
    2. Activate the connection.
    3. Send the data stored in the seed’s `.csv` file to the connection as events.
    4. Deactivate the connection.
    After the `dbt seed` command has finished running, you can access the seed's data on the newly created stream. | +| Tests | Yes | The `dbt test` command behaves differently depending on the `materialize_tests` option set for the specified target.

    If `materialize_tests = false`, then tests are only run after the preview job has completed and returned results. How long a preview job takes as well as what records are returned are defined by the `timeout` and `preview_start` configurations respectively.

    If `materialize_tests = true`, then dbt persists the specified tests as pipeline/stream pairs in Decodable. Use this configuration to allow for continuous testing of your models. You can run a preview on the created stream with the Decodable CLI or web interface to monitor the results. | +| Sources | No | Sources in dbt correspond to Decodable source connections. However, the `dbt source` command is not supported. | +| Docs generate | No | For details about your models, check your Decodable account. | +| Snapshots | No | Snapshots and the `dbt snapshot` command are not supported. | + +## Additional operations + +`dbt-decodable` provides a set of commands for managing the project’s resources on Decodable. Those commands can be run using `dbt run-operation {name} --args {args}`. + +For example, the following command runs the `delete_streams` operation +``` +dbt run-operation delete_streams --args '{streams: [stream1, stream2], skip_errors: True}' +``` + +stop_pipelines(pipelines) +
    • pipelines: An optional list of pipeline names to deactivate. Defaults to none.
    • +
    +Deactivate all pipelines for resources defined within the project. If the pipelines argument is provided, then only the specified pipelines are deactivated. +

    + + +delete_pipelines(pipelines) +
    • pipelines: An optional list of pipeline names to delete. Defaults to none.
    • +
    +Delete all pipelines for resources defined within the project. If the pipelines argument is provided, then only the specified pipelines are deleted. +

    + + +delete_streams(streams, skip_errors) +
      +
    • streams: An optional list of stream names to delete. Defaults to none.
    • +
    • skip_errors: Specify whether to treat errors as warnings. When set to true, any stream deletion failures are reported as warnings. When set to false, the operation stops when a stream cannot be deleted. Defaults to true.
    • +
    +Delete all streams for resources defined within the project. If a pipeline is associated with a stream, then neither the pipeline nor the stream is deleted. See the cleanup operation for a complete removal of stream/pipeline pairs.

    + + +cleanup(list, models, seeds, tests) +
      +
    • list: An optional list of resource entity names to delete. Defaults to none.
    • +
    • models: Specify whether to include models during cleanup. Defaults to true.
    • +
    • seeds: Specify whether to include seeds during cleanup. Defaults to true.
    • +
    • tests: Specify whether to include tests during cleanup. Defaults to true.
    • +
    +

    Delete all Decodable entities resulting from the materialization of the project’s resources, i.e. connections, streams, and pipelines. +If the list argument is provided, then only the specified resource entities are deleted. +If the models, seeds, or test arguments are provided, then those resource types are also included in the cleanup. Tests that have not been materialized are not included in the cleanup. + + + diff --git a/website/docs/docs/core/connect-data-platform/doris-setup.md b/website/docs/docs/core/connect-data-platform/doris-setup.md new file mode 100644 index 00000000000..a7e2ba1ba3e --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/doris-setup.md @@ -0,0 +1,94 @@ +--- +title: "Doris setup" +description: "Read this guide to learn about the Doris warehouse setup in dbt." +id: "doris-setup" +meta: + maintained_by: SelectDB + authors: long2ice,catpineapple + github_repo: 'selectdb/dbt-selectdb' + pypi_package: 'dbt-doris' + min_core_version: 'v1.3.0' + cloud_support: Not Supported + slack_channel_name: '#db-doris' + slack_channel_link: 'https://www.getdbt.com/community' + platform_name: 'Apache Doris / SelectDB' + config_page: '/reference/resource-configs/doris-configs' +--- + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
    • Maintained by: {frontMatter.meta.maintained_by}
    • +
    • Authors: {frontMatter.meta.authors}
    • +
    • GitHub repo: {frontMatter.meta.github_repo}
    • +
    • PyPI package: {frontMatter.meta.pypi_package}
    • +
    • Slack channel: {frontMatter.meta.slack_channel_name}
    • +
    • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
    • +
    • dbt Cloud support: {frontMatter.meta.cloud_support}
    • +
    • Minimum data platform version: {frontMatter.meta.min_supported_version}
    • +
    + + +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

    For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + + +## Connecting to Doris/SelectDB with **dbt-doris** + +### User / Password Authentication + +Configure your dbt profile for using Doris: + +#### Doris connection profile + + +```yaml +dbt-doris: + target: dev + outputs: + dev: + type: doris + host: 127.0.0.1 + port: 9030 + schema: database_name + username: username + password: password + +``` + + + +#### Description of Profile Fields + +| Option | Description | Required? | Example | +|----------|----------------------------------------------------------------------------------------------------------------------------------|-----------|-------------| +| type | The specific adapter to use | Required | `doris` | +| host | The hostname to connect to | Required | `127.0.0.1` | +| port | The port to use | Required | `9030` | +| schema | Specify the schema (database) to build models into, doris have not schema to make a collection of table or view' like PostgreSql | Required | `dbt` | +| username | The username to use to connect to the doris | Required | `root` | +| password | The password to use for authenticating to the doris | Required | `password` | + +## Database User Privileges + +Your Doris/SelectDB database user would be able to have some abilities to read or write. +You can find some help [here](https://doris.apache.org/docs/admin-manual/privilege-ldap/user-privilege) with Doris privileges management. 
+ +| Required Privilege | +|--------------------| +| Select_priv | +| Load_priv | +| Alter_priv | +| Create_priv | +| Drop_priv | diff --git a/website/docs/reference/warehouse-setups/dremio-setup.md b/website/docs/docs/core/connect-data-platform/dremio-setup.md similarity index 68% rename from website/docs/reference/warehouse-setups/dremio-setup.md rename to website/docs/docs/core/connect-data-platform/dremio-setup.md index 66da2980831..fa6ca154fcd 100644 --- a/website/docs/reference/warehouse-setups/dremio-setup.md +++ b/website/docs/docs/core/connect-data-platform/dremio-setup.md @@ -1,17 +1,18 @@ --- title: "Dremio setup" +description: "Read this guide to learn about the Dremio warehouse setup in dbt." meta: maintained_by: Dremio authors: 'Dremio (formerly Fabrice Etanchaud)' github_repo: 'dremio/dbt-dremio' pypi_package: 'dbt-dremio' - min_core_version: 'v1.1.0' + min_core_version: 'v1.2.0' cloud_support: Not Supported min_supported_version: 'Dremio 22.0' slack_channel_name: 'n/a' slack_channel_link: 'https://www.getdbt.com/community' platform_name: 'Dremio' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Vendor plugin @@ -48,24 +49,31 @@ pip is the easiest way to install the adapter:

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    -Follow the repository's link for os dependencies. +Follow the repository's link for OS dependencies. + +:::note +[Model contracts](/docs/collaborate/govern/model-contracts) are not supported. +::: ## Prerequisites for Dremio Cloud Before connecting from project to Dremio Cloud, follow these prerequisite steps: * Ensure that you have the ID of the Sonar project that you want to use. See [Obtaining the ID of a Project](https://docs.dremio.com/cloud/cloud-entities/projects/#obtaining-the-id-of-a-project). * Ensure that you have a personal access token (PAT) for authenticating to Dremio Cloud. See [Creating a Token](https://docs.dremio.com/cloud/security/authentication/personal-access-token/#creating-a-token). +* Ensure that Python 3.9.x or later is installed on the system that you are running dbt on. + ## Prerequisites for Dremio Software * Ensure that you are using version 22.0 or later. +* Ensure that Python 3.9.x or later is installed on the system that you are running dbt on. * Enable these support keys in your Dremio cluster: * `dremio.iceberg.enabled` * `dremio.iceberg.ctas.enabled` * `dremio.execution.support_unlimited_splits` - See Support Keys in the Dremio documentation for the steps. -* If you want to use TLS to secure the connection between dbt and Dremio Software, configure full wire encryption in your Dremio cluster. For instructions, see Configuring Wire Encryption. + See Support Keys in the Dremio documentation for the steps. +* If you want to use TLS to secure the connection between dbt and Dremio Software, configure full wire encryption in your Dremio cluster. For instructions, see Configuring Wire Encryption. 
## Initializing a Project @@ -76,11 +84,6 @@ Before connecting from project to Dremio Cloud, follow these prerequisite steps: * `dremio_cloud` for working with Dremio Cloud * `software_with_username_password` for working with a Dremio Software cluster and authenticating to the cluster with a username and a password * `software_with_pat` for working with a Dremio Software cluster and authenticating to the cluster with a personal access token -4. Append these lines to the end of the content of the `dbt_project.yml` file at the root of your project directory: -``` -vars: - dremio:reflections_enabled: false -``` Next, configure the profile for your project. @@ -118,6 +121,10 @@ For descriptions of the configurations in these profiles, see [Configurations](# dev: cloud_host: https://api.dremio.cloud cloud_project_id: [project ID] + object_storage_source: [name] + object_storage_path: [path] + dremio_space: [name] + dremio_space_folder: [path] pat: [personal access token] threads: [integer >= 1] type: dremio @@ -137,6 +144,10 @@ For descriptions of the configurations in these profiles, see [Configurations](# password: [password] port: [port] software_host: [hostname or IP address] + object_storage_source: [name] + object_storage_path: [path] + dremio_space: [name] + dremio_space_folder: [path] threads: [integer >= 1] type: dremio use_ssl: [true|false] @@ -155,6 +166,10 @@ For descriptions of the configurations in these profiles, see [Configurations](# pat: [personal access token] port: [port] software_host: [hostname or IP address] + object_storage_source: [name] + object_storage_path: [path] + dremio_space: [name] + dremio_space_folder: [path] threads: [integer >= 1] type: dremio use_ssl: [true|false] @@ -169,17 +184,23 @@ For descriptions of the configurations in these profiles, see [Configurations](# ### Configurations Common to Profiles for Dremio Cloud and Dremio Software + | Configuration | Required? 
| Default Value | Description | | --- | --- | --- | --- | | `type` | Yes | dremio | Auto-populated when creating a Dremio project. Do not change this value. | | `threads` | Yes | 1 | The number of threads the dbt project runs on. | - +| `object_storage_source` | No | $scratch | The name of the filesystem in which to create tables, materialized views, tests, and other objects. The dbt alias is `datalake`. This name corresponds to the name of a source in the **Object Storage** section of the Datasets page in Dremio, which is "Samples" in the following image: ![dbt samples path](/img/reference/dremio-setup/dbt-Samples.png) | +| `object_storage_path` | No | `no_schema` | The path in the filesystem in which to create objects. The default is the root level of the filesystem. The dbt alias is `root_path`. Nested folders in the path are separated with periods. This value corresponds to the path in this location in the Datasets page in Dremio, which is "samples.dremio.com.Dremio University" in the following image: ![dbt samples path](/img/reference/dremio-setup/dbt-SamplesPath.png) | +| `dremio_space` | No | `@\` | The value of the Dremio space in which to create views. The dbt alias is `database`. This value corresponds to the name in this location in the **Spaces** section of the Datasets page in Dremio: ![dbt spaces](/img/reference/dremio-setup/dbt-Spaces.png) | +| `dremio_space_folder` | No | `no_schema` | The folder in the Dremio space in which to create views. The default is the top level in the space. The dbt alias is `schema`. Nested folders are separated with periods. This value corresponds to the path in this location in the Datasets page in Dremio, which is `Folder1.Folder2` in the following image: ![Folder1.Folder2](/img/reference/dremio-setup/dbt-SpacesPath.png) | + ### Configurations in Profiles for Dremio Cloud + | Configuration | Required? 
| Default Value | Description | | --- | --- | --- | --- | | `cloud_host` | Yes | `https://api.dremio.cloud` | US Control Plane: `https://api.dremio.cloud`

    EU Control Plane: `https://api.eu.dremio.cloud` | | `user` | Yes | None | Email address used as a username in Dremio Cloud | -| `pat` | Yes | None | Personal Access TokenSee Personal Access Tokens for instructions about obtaining a token. | +| `pat` | Yes | None | The personal access token to use for authentication. See [Personal Access Tokens](https://docs.dremio.com/cloud/security/authentication/personal-access-token/) for instructions about obtaining a token. | | `cloud_project_id` | Yes | None | The ID of the Sonar project in which to run transformations. | | `use_ssl` | Yes | `true` | The value must be `true`. | @@ -190,5 +211,5 @@ For descriptions of the configurations in these profiles, see [Configurations](# | `port` | Yes | `9047` | Port for Dremio Software cluster API endpoints. | | `user` | Yes | None | The username of the account to use when logging into the Dremio cluster. | | `password` | Yes, if you are not using the pat configuration. | None | The password of the account to use when logging into the Dremio cluster. | -| `pat` | Yes, if you are not using the user and password configurations. | None | The personal access token to use for authenticating to Dremio.See Personal Access Tokens for instructions about obtaining a token.The use of a personal access token takes precedence if values for the three configurations user, password and pat are specified. | -| `use_ssl` | Yes | `true` | Acceptable values are `true` and `false`. If the value is set to true, ensure that full wire encryption is configured in your Dremio cluster. See [Prerequisites for Dremio Software](#prerequisites-for-dremio-software). | \ No newline at end of file +| `pat` | Yes, if you are not using the user and password configurations. | None | The personal access token to use for authenticating to Dremio. See [Personal Access Tokens](https://docs.dremio.com/software/security/personal-access-tokens/) for instructions about obtaining a token. 
The use of a personal access token takes precedence if values for the three configurations user, password and pat are specified. | +| `use_ssl` | Yes | `true` | Acceptable values are `true` and `false`. If the value is set to true, ensure that full wire encryption is configured in your Dremio cluster. See [Prerequisites for Dremio Software](#prerequisites-for-dremio-software). | diff --git a/website/docs/reference/warehouse-setups/duckdb-setup.md b/website/docs/docs/core/connect-data-platform/duckdb-setup.md similarity index 78% rename from website/docs/reference/warehouse-setups/duckdb-setup.md rename to website/docs/docs/core/connect-data-platform/duckdb-setup.md index d4488a2ead6..a3fee5a5164 100644 --- a/website/docs/reference/warehouse-setups/duckdb-setup.md +++ b/website/docs/docs/core/connect-data-platform/duckdb-setup.md @@ -1,9 +1,10 @@ --- title: "DuckDB setup" +description: "Read this guide to learn about the DuckDB warehouse setup in dbt." meta: maintained_by: Community authors: 'Josh Wills (https://github.com/jwills)' - github_repo: 'jwills/dbt-duckdb' + github_repo: 'duckdb/dbt-duckdb' pypi_package: 'dbt-duckdb' min_core_version: 'v1.0.1' cloud_support: Not Supported @@ -11,7 +12,7 @@ meta: slack_channel_name: '#db-duckdb' slack_channel_link: 'https://getdbt.slack.com/archives/C039D1J1LA2' platform_name: 'Duck DB' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Community plugin @@ -55,7 +56,9 @@ pip is the easiest way to install the adapter: There is also a `database` field defined in the `DuckDBCredentials` class for consistency with the parent `Credentials` class, but it defaults to `main` and setting it to be something else will likely cause strange things to happen that cannot be fully predicted, so please avoid changing it. 
-Example: +As of version 1.2.3, you can load any supported [DuckDB extensions](https://duckdb.org/docs/extensions/overview) by listing them in the `extensions` field in your profile. You can also set any additional [DuckDB configuration options](https://duckdb.org/docs/sql/configuration) via the `settings` field, including options that are supported in any loaded extensions. + +For example, to be able to connect to `s3` and read/write `parquet` files using an AWS access key and secret, your profile would look something like this: @@ -66,8 +69,13 @@ your_profile_name: dev: type: duckdb path: 'file_path/database_name.duckdb' - #optional fields - schema: schema_name + extensions: + - httpfs + - parquet + settings: + s3_region: my-aws-region + s3_access_key_id: "{{ env_var('S3_ACCESS_KEY_ID') }}" + s3_secret_access_key: "{{ env_var('S3_SECRET_ACCESS_KEY') }}" ``` diff --git a/website/docs/reference/warehouse-setups/exasol-setup.md b/website/docs/docs/core/connect-data-platform/exasol-setup.md similarity index 81% rename from website/docs/reference/warehouse-setups/exasol-setup.md rename to website/docs/docs/core/connect-data-platform/exasol-setup.md index 1e29db02513..2bf4cd7ffac 100644 --- a/website/docs/reference/warehouse-setups/exasol-setup.md +++ b/website/docs/docs/core/connect-data-platform/exasol-setup.md @@ -1,5 +1,6 @@ --- title: "Exasol setup" +description: "Read this guide to learn about the Exasol warehouse setup in dbt." meta: maintained_by: Community authors: 'Torsten Glunde, Ilija Kutle' @@ -11,7 +12,7 @@ meta: slack_channel_name: 'n/a' slack_channel_link: 'https://www.getdbt.com/community' platform_name: 'Exasol' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Community plugin @@ -72,5 +73,19 @@ dbt-exasol: dbname: db schema: SCHEMA ``` -
    + +#### Optional parameters + +- **`connection_timeout`** — defaults to pyexasol default +- **`socket_timeout`** — defaults to pyexasol default +- **`query_timeout`** — defaults to pyexasol default +- **`compression`** — default: False +- **`encryption`** — default: False +- **`protocol_version`** — default: v3 +- **`row_separator`** — default: CRLF for windows - LF otherwise +- **`timestamp_format`** — default: `YYYY-MM-DDTHH:MI:SS.FF6` + + + + diff --git a/website/docs/docs/core/connect-data-platform/fabric-setup.md b/website/docs/docs/core/connect-data-platform/fabric-setup.md new file mode 100644 index 00000000000..aa7784d96ec --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/fabric-setup.md @@ -0,0 +1,386 @@ +--- +title: "Microsoft Fabric Synapse Data Warehouse setup" +description: "Read this guide to learn about the Microsoft Fabric Synapse Data Warehouse setup in dbt." +id: fabric-setup +meta: + maintained_by: Microsoft + authors: '[Microsoft](https://github.com/Microsoft)' + github_repo: 'Microsoft/dbt-fabric' + pypi_package: 'dbt-fabric' + min_core_version: '1.4.0' + cloud_support: Not Supported + platform_name: 'Microsoft Fabric' + config_page: '/reference/resource-configs/fabric-configs' +--- + +:::info + +Below is a guide for use with "Synapse Data Warehouse" a new product within Microsoft Fabric (preview) ([more info](https://learn.microsoft.com/en-us/fabric/data-warehouse/data-warehousing#synapse-data-warehouse)) + +To learn how to set up dbt with Azure Synapse Dedicated Pools, see [Microsoft Azure Synapse DWH setup](/docs/core/connect-data-platform/azuresynapse-setup) + +::: + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
    • Maintained by: {frontMatter.meta.maintained_by}
    • +
    • Authors: {frontMatter.meta.authors}
    • +
    • GitHub repo: {frontMatter.meta.github_repo}
    • +
    • PyPI package: {frontMatter.meta.pypi_package}
    • +
    • Slack channel: {frontMatter.meta.slack_channel_name}
    • +
    • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
    • +
    • dbt Cloud support: {frontMatter.meta.cloud_support}
    • +
    + +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

    For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + +### Prerequisites + +On Debian/Ubuntu make sure you have the ODBC header files before installing + +```bash +sudo apt install unixodbc-dev +``` + +Download and install the [Microsoft ODBC Driver 18 for SQL Server](https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver15). +If you already have ODBC Driver 17 installed, then that one will work as well. + +#### Supported configurations + +* The adapter is tested with Microsoft Fabric Synapse Data Warehouse. +* We test all combinations with Microsoft ODBC Driver 17 and Microsoft ODBC Driver 18. +* The collations we run our tests on are `Latin1_General_100_BIN2_UTF8`. + +The adapter support is not limited to the matrix of the above configurations. If you notice an issue with any other configuration, let us know by opening an issue on [GitHub](https://github.com/microsoft/dbt-fabric). + +## Authentication methods & profile configuration + +### Common configuration + +For all the authentication methods, refer to the following configuration options that can be set in your `profiles.yml` file. +A complete reference of all options can be found [at the end of this page](#reference-of-all-connection-options). + +| Configuration option | Description | Type | Example | +| --------------------- | ---- | ---- | ------- | +| `driver` | The ODBC driver to use | Required | `ODBC Driver 18 for SQL Server` | +| `server` | The server hostname | Required | `localhost` | +| `port` | The server port | Required | `1433` | +| `database` | The database name | Required | Not applicable | +| `schema` | The schema name | Required | `dbo` | +| `retries` | The number of automatic times to retry a query before failing. Defaults to `1`. Queries with syntax errors will not be retried. This setting can be used to overcome intermittent network issues. | Optional | Not applicable | +| `login_timeout` | The number of seconds used to establish a connection before failing. 
Defaults to `0`, which means that the timeout is disabled or uses the default system settings. | Optional | Not applicable | +| `query_timeout` | The number of seconds used to wait for a query before failing. Defaults to `0`, which means that the timeout is disabled or uses the default system settings. | Optional | Not applicable | +| `schema_authorization` | Optionally set this to the principal who should own the schemas created by dbt. [Read more about schema authorization](#schema-authorization). | Optional | Not applicable | +| `encrypt` | Whether to encrypt the connection to the server. Defaults to `true`. Read more about [connection encryption](#connection-encryption). | Optional | Not applicable | +| `trust_cert` | Whether to trust the server certificate. Defaults to `false`. Read more about [connection encryption](#connection-encryption).| Optional | Not applicable | + +### Connection encryption + +Microsoft made several changes in the release of ODBC Driver 18 that affects how connection encryption is configured. +To accommodate these changes, starting in dbt-sqlserver 1.2.0 or newer the default values of `encrypt` and `trust_cert` have changed. +Both of these settings will now **always** be included in the connection string to the server, regardless if you've left them out of your profile configuration or not. + +* The default value of `encrypt` is `true`, meaning that connections are encrypted by default. +* The default value of `trust_cert` is `false`, meaning that the server certificate will be validated. By setting this to `true`, a self-signed certificate will be accepted. + +More details about how these values affect your connection and how they are used differently in versions of the ODBC driver can be found in the [Microsoft documentation](https://learn.microsoft.com/en-us/sql/connect/odbc/dsn-connection-string-attribute?view=sql-server-ver16#encrypt). 
+ +### Standard SQL Server authentication + +SQL Server and windows authentication are not supported by Microsoft Fabric Synapse Data Warehouse. + +### Azure Active Directory Authentication (AAD) + +Azure Active Directory authentication is a default authentication mechanism in Microsoft Fabric Synapse Data Warehouse. + +The following additional methods are available to authenticate to Azure SQL products: + +* AAD username and password +* Service principal (a.k.a. AAD Application) +* Environment-based authentication +* Azure CLI authentication +* VS Code authentication (available through the automatic option below) +* Azure PowerShell module authentication (available through the automatic option below) +* Automatic authentication + +The automatic authentication setting is in most cases the easiest choice and works for all of the above. + + + + + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: fabric + driver: 'ODBC Driver 18 for SQL Server' # (The ODBC Driver installed on your system) + server: hostname or IP of your server + port: 1433 + database: exampledb + schema: schema_name + authentication: ActiveDirectoryPassword + user: bill.gates@microsoft.com + password: iheartopensource +``` + + + + + + + +Client ID is often also referred to as Application ID. + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: fabric + driver: 'ODBC Driver 18 for SQL Server' # (The ODBC Driver installed on your system) + server: hostname or IP of your server + port: 1433 + database: exampledb + schema: schema_name + authentication: ServicePrincipal + tenant_id: 00000000-0000-0000-0000-000000001234 + client_id: 00000000-0000-0000-0000-000000001234 + client_secret: S3cret! +``` + + + + + + + +This authentication option allows you to dynamically select an authentication method depending on the available environment variables. 
+ +[The Microsoft docs on EnvironmentCredential](https://docs.microsoft.com/en-us/python/api/azure-identity/azure.identity.environmentcredential?view=azure-python) +explain the available combinations of environment variables you can use. + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: fabric + driver: 'ODBC Driver 18 for SQL Server' # (The ODBC Driver installed on your system) + server: hostname or IP of your server + port: 1433 + database: exampledb + schema: schema_name + authentication: environment +``` + + + + + + + +First, install the [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli), then, log in: + +`az login` + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: fabric + driver: 'ODBC Driver 18 for SQL Server' # (The ODBC Driver installed on your system) + server: hostname or IP of your server + port: 1433 + database: exampledb + schema: schema_name + authentication: CLI +``` + + + + + + + +This authentication option will automatically try to use all available authentication methods. + +The following methods are tried in order: + +1. Environment-based authentication +2. Managed Identity authentication. Managed Identity is not supported at this time. +3. Visual Studio authentication (*Windows only, ignored on other operating systems*) +4. Visual Studio Code authentication +5. Azure CLI authentication +6. 
Azure PowerShell module authentication + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: fabric + driver: 'ODBC Driver 18 for SQL Server' # (The ODBC Driver installed on your system) + server: hostname or IP of your server + port: 1433 + database: exampledb + schema: schema_name + authentication: auto +``` + + + + + + + +#### Additional options for AAD on Windows + +On Windows systems, the following additional authentication methods are also available for Azure SQL: + +* AAD interactive +* AAD integrated +* Visual Studio authentication (available through the automatic option above) + + + + + +This setting can optionally show Multi-Factor Authentication prompts. + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: fabric + driver: 'ODBC Driver 18 for SQL Server' # (The ODBC Driver installed on your system) + server: hostname or IP of your server + port: 1433 + database: exampledb + schema: schema_name + authentication: ActiveDirectoryInteractive + user: bill.gates@microsoft.com +``` + + + + + + + +This uses the credentials you're logged in with on the current machine. + + + +```yaml +your_profile_name: + target: dev + outputs: + dev: + type: fabric + driver: 'ODBC Driver 18 for SQL Server' # (The ODBC Driver installed on your system) + server: hostname or IP of your server + port: 1433 + database: exampledb + schema: schema_name + authentication: ActiveDirectoryIntegrated +``` + + + + + + + +### Automatic AAD principal provisioning for grants + +Please note that automatic AAD principal provisioning is not supported by Microsoft Fabric Synapse Data Warehouse at this time. Even though in dbt 1.2 or newer you can use the [grants](https://docs.getdbt.com/reference/resource-configs/grants) config block to automatically grant/revoke permissions on your models to users or groups, the data warehouse does not support this feature at this time. 
+ +You need to add the service principal or AAD identity to a Fabric Workspace as an admin + +### Schema authorization + +You can optionally set the principal who should own all schemas created by dbt. This is then used in the `CREATE SCHEMA` statement like so: + +```sql +CREATE SCHEMA [schema_name] AUTHORIZATION [schema_authorization] +``` + +A common use case is to use this when you are authenticating with a principal who has permissions based on a group, such as an AAD group. When that principal creates a schema, the server will first try to create an individual login for this principal and then link the schema to that principal. If you would be using Azure AD in this case, +then this would fail since Azure SQL can't create logins for individuals part of an AD group automatically. + +### Reference of all connection options + +| Configuration option | Description | Required | Default value | +| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ | ------------- | +| `driver` | The ODBC driver to use. | :white_check_mark: | | +| `host` | The hostname of the database server. | :white_check_mark: | | +| `port` | The port of the database server. | | `1433` | +| `database` | The name of the database to connect to. | :white_check_mark: | | +| `schema` | The schema to use. | :white_check_mark: | | +| `authentication` | The authentication method to use. This is not required for Windows authentication. | | `'sql'` | +| `UID` | Username used to authenticate. This can be left out depending on the authentication method. | | | +| `PWD` | Password used to authenticate. This can be left out depending on the authentication method. | | | +| `tenant_id` | The tenant ID of the Azure Active Directory instance. This is only used when connecting to Azure SQL with a service principal. 
| | | +| `client_id` | The client ID of the Azure Active Directory service principal. This is only used when connecting to Azure SQL with an AAD service principal. | | | +| `client_secret` | The client secret of the Azure Active Directory service principal. This is only used when connecting to Azure SQL with an AAD service principal. | | | +| `encrypt` | Set this to `false` to disable the use of encryption. See [above](#connection-encryption). | | `true` | +| `trust_cert` | Set this to `true` to trust the server certificate. See [above](#connection-encryption). | | `false` | +| `retries` | The number of times to retry a failed connection. | | `1` | +| `schema_authorization` | Optionally set this to the principal who should own the schemas created by dbt. [Details above](#schema-authorization). | | | +| `login_timeout` | The amount of seconds to wait until a response from the server is received when establishing a connection. `0` means that the timeout is disabled. | | `0` | +| `query_timeout` | The amount of seconds to wait until a response from the server is received when executing a query. `0` means that the timeout is disabled. 
| | `0` | + +Valid values for `authentication`: + +* `ActiveDirectoryPassword`: Active Directory authentication using username and password +* `ActiveDirectoryInteractive`: Active Directory authentication using a username and MFA prompts +* `ActiveDirectoryIntegrated`: Active Directory authentication using the current user's credentials +* `ServicePrincipal`: Azure Active Directory authentication using a service principal +* `CLI`: Azure Active Directory authentication using the account you're logged in within the Azure CLI +* `environment`: Azure Active Directory authentication using environment variables as documented [here](https://learn.microsoft.com/en-us/python/api/azure-identity/azure.identity.environmentcredential?view=azure-python) +* `auto`: Azure Active Directory authentication trying the previous authentication methods until it finds one that works diff --git a/website/docs/reference/warehouse-setups/vertica-setup.md b/website/docs/docs/core/connect-data-platform/fal-setup.md similarity index 58% rename from website/docs/reference/warehouse-setups/vertica-setup.md rename to website/docs/docs/core/connect-data-platform/fal-setup.md index 23f5d581a71..ef4998e8c1b 100644 --- a/website/docs/reference/warehouse-setups/vertica-setup.md +++ b/website/docs/docs/core/connect-data-platform/fal-setup.md @@ -1,19 +1,18 @@ --- -title: "Vertica setup" -id: "vertica-setup" +title: "fal setup (Python models)" +description: "Read this guide to learn about the fal warehouse setup in dbt." 
meta: - maintained_by: Community - authors: Matthew Carter, Andy Regan, Andrew Hedengren - github_repo: 'mpcarter/dbt-vertica' - pypi_package: 'dbt-vertica' - min_core_version: 'v0.21.0' + maintained_by: fal.ai + authors: 'Features & Labels (https://github.com/fal-ai)' + github_repo: 'fal-ai/fal' + pypi_package: 'dbt-fal' + min_core_version: 'v1.3.0' cloud_support: Not Supported - min_supported_version: 'Vertica 10.0' - slack_channel_name: 'n/a' - slack_channel_link: 'https://www.getdbt.com/community/' - platform_name: 'Vertica' - config_page: 'no-configs' - + min_supported_version: 'n/a' + slack_channel_name: '#tools-fal' + slack_channel_link: 'https://getdbt.slack.com/archives/C02V8QW3Q4Q' + platform_name: 'fal' + config_page: '/reference/resource-configs/fal-configs' --- :::info Community plugin @@ -40,10 +39,12 @@ Some core functionality may be limited. If you're interested in contributing, ch pip is the easiest way to install the adapter: -pip install {frontMatter.meta.pypi_package} +pip install {frontMatter.meta.pypi_package}[<sql-adapter>]

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    +

    You must install the adapter for SQL transformations and data storage independently from dbt-fal.

    +

    Configuring {frontMatter.meta.pypi_package}

    For {frontMatter.meta.platform_name}-specifc configuration please refer to {frontMatter.meta.platform_name} Configuration

    @@ -51,33 +52,26 @@ pip is the easiest way to install the adapter:

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    -### Connecting to Vertica with **dbt-vertica** +## Setting up fal with other adapter -#### Username / password authentication +[fal](http://github.com/fal-ai/fal) offers a Python runtime independent from what database you are using and integrates seamlessly with dbt. It works by downloading the data as a Pandas DataFrame, transforming it in a local Python runtime and uploading it to the database. The only configuration change you need to do is adding it to the `profiles.yml` and setting the `db_profile` property as the database profile you are already using. -Configure your dbt profile for using Vertica: +It will run all the SQL dbt models with the main adapter you specified in your `profiles.yml` and all the Python models are executed by the fal adapter. -##### Vertica connection information +Example: ```yaml -your-profile: +jaffle_shop: + target: dev_with_fal outputs: - dev: - type: vertica # Don't change this! - host: vertica-host-name - port: 5433 # or your custom port (optional) - username: your-username - password: your-password - database: vertica-database-name - schema: your-default-schema - target: dev + dev_with_fal: + type: fal + db_profile: dev_pg # This points to your main adapter + dev_pg: + type: postgres + ... ``` - -By default, `dbt-vertica` will request `ConnectionLoadBalance=true` (which is generally a good thing), and set a session label of `dbt_your-username`. - -There are three options for SSL: `ssl`, `ssl_env_cafile`, and `ssl_uri`. -See their use in the code [here](https://github.com/mpcarter/dbt-vertica/blob/d15f925049dabd2833b4d88304edd216e3f654ed/dbt/adapters/vertica/connections.py#L72-L87). 
diff --git a/website/docs/reference/warehouse-setups/firebolt-setup.md b/website/docs/docs/core/connect-data-platform/firebolt-setup.md similarity index 96% rename from website/docs/reference/warehouse-setups/firebolt-setup.md rename to website/docs/docs/core/connect-data-platform/firebolt-setup.md index 7c724307131..c7a5a543512 100644 --- a/website/docs/reference/warehouse-setups/firebolt-setup.md +++ b/website/docs/docs/core/connect-data-platform/firebolt-setup.md @@ -1,5 +1,6 @@ --- title: "Firebolt setup" +description: "Read this guide to learn about the Firebolt warehouse setup in dbt." meta: maintained_by: Firebolt authors: 'Firebolt' @@ -11,7 +12,7 @@ meta: slack_channel_name: '#db-firebolt' slack_channel_link: 'https://getdbt.slack.com/archives/C03K2PTHHTP' platform_name: 'Firebolt' - config_page: 'firebolt-configs' + config_page: '/reference/resource-configs/firebolt-configs' --- @@ -52,7 +53,7 @@ For other information including Firebolt feature support, see the [GitHub README ## Connecting to Firebolt -To connect to Firebolt from dbt, you'll need to add a [profile](https://docs.getdbt.com/dbt-cli/configure-your-profile) to your `profiles.yml` file. A Firebolt profile conforms to the following syntax: +To connect to Firebolt from dbt, you'll need to add a [profile](https://docs.getdbt.com/docs/core/connection-profiles) to your `profiles.yml` file. A Firebolt profile conforms to the following syntax: diff --git a/website/docs/docs/core/connect-data-platform/glue-setup.md b/website/docs/docs/core/connect-data-platform/glue-setup.md new file mode 100644 index 00000000000..e56e5bcd902 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/glue-setup.md @@ -0,0 +1,1052 @@ +--- +title: "AWS Glue setup" +description: "Read this guide to learn about the AWS Glue warehouse setup in dbt." 
+id: "glue-setup" +meta: + maintained_by: Community + authors: 'Benjamin Menuet, Moshir Mikael, Armando Segnini and Amine El Mallem' + github_repo: 'aws-samples/dbt-glue' + pypi_package: 'dbt-glue' + min_core_version: 'v0.24.0' + cloud_support: Not Supported + min_supported_version: 'Glue 2.0' + slack_channel_name: '#db-glue' + slack_channel_link: 'https://getdbt.slack.com/archives/C02R4HSMBAT' + platform_name: 'AWS Glue' + config_page: '/reference/resource-configs/glue-configs' +--- + +:::info Community plugin + +Some core functionality may be limited. If you're interested in contributing, check out the source code for each repository listed below. + +::: + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
    • Maintained by: {frontMatter.meta.maintained_by}
    • +
    • Authors: {frontMatter.meta.authors}
    • +
    • GitHub repo: {frontMatter.meta.github_repo}
    • +
    • PyPI package: {frontMatter.meta.pypi_package}
    • +
    • Slack channel: {frontMatter.meta.slack_channel_name}
    • +
    • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
    • +
    • dbt Cloud support: {frontMatter.meta.cloud_support}
    • +
    • Minimum data platform version: {frontMatter.meta.min_supported_version}
    • +
    + + +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

    For {frontMatter.meta.platform_name}-specific configuration, please refer to {frontMatter.meta.platform_name} Configuration

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + +For further (and more likely up-to-date) info, see the [README](https://github.com/aws-samples/dbt-glue#readme) + + +## Connection Methods + + +### Configuring your AWS profile for Glue Interactive Session +There are two IAM principals used with interactive sessions. +- Client principal: The principal (either user or role) calling the AWS APIs (Glue, Lake Formation, Interactive Sessions) +from the local client. This is the principal configured in the AWS CLI and is likely the same. +- Service role: The IAM role that AWS Glue uses to execute your session. This is the same as AWS Glue +ETL. + +Read [this documentation](https://docs.aws.amazon.com/glue/latest/dg/glue-is-security.html) to configure these principals. + +You will find below a least privileged policy to enjoy all features of **`dbt-glue`** adapter. + +Please to update variables between **`<>`**, here are explanations of these arguments: + +|Args |Description | +|---|---| +|region|The region where your Glue database is stored | +|AWS Account|The AWS account where you run your pipeline| +|dbt output database|The database updated by dbt (this is the schema configured in the profile.yml of your dbt environment)| +|dbt source database|All databases used as source| +|dbt output bucket|The bucket name where the data will be generated by dbt (the location configured in the profile.yml of your dbt environment)| +|dbt source bucket|The bucket name of source databases (if they are not managed by Lake Formation)| + + + +```yml +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "Read_and_write_databases", + "Action": [ + "glue:SearchTables", + "glue:BatchCreatePartition", + "glue:CreatePartitionIndex", + "glue:DeleteDatabase", + "glue:GetTableVersions", + "glue:GetPartitions", + "glue:DeleteTableVersion", + "glue:UpdateTable", + "glue:DeleteTable", + "glue:DeletePartitionIndex", + "glue:GetTableVersion", + "glue:UpdateColumnStatisticsForTable", + "glue:CreatePartition", + "glue:UpdateDatabase", + 
"glue:CreateTable", + "glue:GetTables", + "glue:GetDatabases", + "glue:GetTable", + "glue:GetDatabase", + "glue:GetPartition", + "glue:UpdateColumnStatisticsForPartition", + "glue:CreateDatabase", + "glue:BatchDeleteTableVersion", + "glue:BatchDeleteTable", + "glue:DeletePartition", + "glue:GetUserDefinedFunctions", + "lakeformation:ListResources", + "lakeformation:BatchGrantPermissions", + "lakeformation:ListPermissions", + "lakeformation:GetDataAccess", + "lakeformation:GrantPermissions", + "lakeformation:RevokePermissions", + "lakeformation:BatchRevokePermissions", + "lakeformation:AddLFTagsToResource", + "lakeformation:RemoveLFTagsFromResource", + "lakeformation:GetResourceLFTags", + "lakeformation:ListLFTags", + "lakeformation:GetLFTag", + ], + "Resource": [ + "arn:aws:glue:::catalog", + "arn:aws:glue:::table//*", + "arn:aws:glue:::database/" + ], + "Effect": "Allow" + }, + { + "Sid": "Read_only_databases", + "Action": [ + "glue:SearchTables", + "glue:GetTableVersions", + "glue:GetPartitions", + "glue:GetTableVersion", + "glue:GetTables", + "glue:GetDatabases", + "glue:GetTable", + "glue:GetDatabase", + "glue:GetPartition", + "lakeformation:ListResources", + "lakeformation:ListPermissions" + ], + "Resource": [ + "arn:aws:glue:::table//*", + "arn:aws:glue:::database/", + "arn:aws:glue:::database/default", + "arn:aws:glue:::database/global_temp" + ], + "Effect": "Allow" + }, + { + "Sid": "Storage_all_buckets", + "Action": [ + "s3:GetBucketLocation", + "s3:ListBucket" + ], + "Resource": [ + "arn:aws:s3:::", + "arn:aws:s3:::" + ], + "Effect": "Allow" + }, + { + "Sid": "Read_and_write_buckets", + "Action": [ + "s3:PutObject", + "s3:PutObjectAcl", + "s3:GetObject", + "s3:DeleteObject" + ], + "Resource": [ + "arn:aws:s3:::" + ], + "Effect": "Allow" + }, + { + "Sid": "Read_only_buckets", + "Action": [ + "s3:GetObject" + ], + "Resource": [ + "arn:aws:s3:::" + ], + "Effect": "Allow" + } + ] +} +``` + + +### Configuration of the local environment + +Because **`dbt`** and 
**`dbt-glue`** adapters are compatible with Python versions 3.7, 3.8, and 3.9, check the version of Python: + +```bash +$ python3 --version +``` + +Configure a Python virtual environment to isolate package version and code dependencies: + +```bash +$ sudo yum install git +$ python3 -m venv dbt_venv +$ source dbt_venv/bin/activate +$ python3 -m pip install --upgrade pip +``` + +Install the latest version of the AWS CLI + +```bash +$ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" +$ unzip awscliv2.zip +$ sudo ./aws/install +``` + +Install boto3 package + +```bash +$ sudo yum install gcc krb5-devel.x86_64 python3-devel.x86_64 -y +$ pip3 install --upgrade boto3 +``` + +Install the package: + +```bash +$ pip3 install dbt-glue +``` + +### Example config + + +```yml +type: glue +query-comment: This is a glue dbt example +role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole +region: us-east-1 +workers: 2 +worker_type: G.1X +idle_timeout: 10 +schema: "dbt_demo" +session_provisioning_timeout_in_seconds: 120 +location: "s3://dbt_demo_bucket/dbt_demo_data" +``` + + + +The table below describes all the options. + +| Option | Description | Mandatory | +|-----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------| +| project_name | The dbt project name. This must be the same as the one configured in the dbt project. | yes | +| type | The driver to use. | yes | +| query-comment | A string to inject as a comment in each query that dbt runs. | no | +| role_arn | The ARN of the glue interactive session IAM role. | yes | +| region | The AWS Region where you run the data pipeline. 
| yes | +| workers | The number of workers of a defined workerType that are allocated when a job runs. | yes | +| worker_type | The type of predefined worker that is allocated when a job runs. Accepts a value of Standard, G.1X, or G.2X. | yes | +| schema | The schema used to organize data stored in Amazon S3.Additionally, is the database in AWS Lake Formation that stores metadata tables in the Data Catalog. | yes | +| session_provisioning_timeout_in_seconds | The timeout in seconds for AWS Glue interactive session provisioning. | yes | +| location | The Amazon S3 location of your target data. | yes | +| query_timeout_in_minutes | The timeout in minutes for a single query. Default is 300 | no | +| idle_timeout | The AWS Glue session idle timeout in minutes. (The session stops after being idle for the specified amount of time) | no | +| glue_version | The version of AWS Glue for this session to use. Currently, the only valid options are 2.0 and 3.0. The default value is 3.0. | no | +| security_configuration | The security configuration to use with this session. | no | +| connections | A comma-separated list of connections to use in the session. | no | +| conf | Specific configuration used at the startup of the Glue Interactive Session (arg --conf) | no | +| extra_py_files | Extra python Libs that can be used by the interactive session. | no | +| delta_athena_prefix | A prefix used to create Athena-compatible tables for Delta tables (if not specified, then no Athena-compatible table will be created) | no | +| tags | The map of key-value pairs (tags) belonging to the session. Ex: `KeyName1=Value1,KeyName2=Value2` | no | +| seed_format | By default `parquet`, can be Spark format compatible like `csv` or `json` | no | +| seed_mode | By default `overwrite`, the seed data will be overwritten, you can set it to `append` if you just want to add new data in your dataset | no | +| default_arguments | The map of key-value pairs parameters belonging to the session. 
More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html). Ex: `--enable-continuous-cloudwatch-log=true,--enable-continuous-log-filter=true` | no | +| glue_session_id | Reuse the glue-session to run multiple dbt run commands: set a glue session id you need to use | no | +| glue_session_reuse | Reuse the glue-session to run multiple dbt run commands: If set to true, the glue session will not be closed for re-use. If set to false, the session will be closed | no | +| datalake_formats | The ACID data lake format that you want to use if you are doing merge, can be `hudi`, `iceberg` or `delta` |no| + +## Configs + +### Configuring tables + +When materializing a model as `table`, you may include several optional configs that are specific to the dbt-spark plugin, in addition to the standard [model configs](/reference/model-configs). + +| Option | Description | Required? | Example | +|---------|----------------------------------------------------|-------------------------|--------------------------| +| file_format | The file format to use when creating tables (`parquet`, `csv`, `json`, `text`, `jdbc` or `orc`). | Optional | `parquet`| +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` | +| custom_location | By default, the adapter will store your data in the following path: `location path`/`schema`/`table`. 
If you don't want to follow that default behaviour, you can use this parameter to set your own custom location on S3 | No | `s3://mycustombucket/mycustompath` | +| hudi_options | When using file_format `hudi`, gives the ability to overwrite any of the default configuration options. | Optional | `{'hoodie.schema.on.read.enable': 'true'}` | +## Incremental models + +dbt seeks to offer useful and intuitive modeling abstractions by means of its built-in configurations and materializations. + +For that reason, the dbt-glue plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of three values: + - **`append`**: Insert new records without updating or overwriting any existing data. + - **`insert_overwrite`** (default): If `partition_by` is specified, overwrite partitions in the table with new data. If no `partition_by` is specified, overwrite the entire table with new data. + - **`merge`** (Apache Hudi, Delta Lake, and Apache Iceberg only): Match records based on a `unique_key`; update old records, and insert new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + +Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block. + +**Notes:** +The default strategy is **`insert_overwrite`** + +### The `append` strategy + +Following the `append` strategy, dbt will perform an `insert into` statement with all new data. The appeal of this strategy is that it is straightforward and functional across all platforms, file types, connection methods, and Apache Spark versions. However, this strategy _cannot_ update, overwrite, or delete existing data, so it is likely to insert duplicate records for many data sources. 
+ +#### Source code +```sql +{{ config( + materialized='incremental', + incremental_strategy='append', +) }} + +-- All rows returned by this query will be appended to the existing table + +select * from {{ ref('events') }} +{% if is_incremental() %} + where event_ts > (select max(event_ts) from {{ this }}) +{% endif %} +``` +#### Run Code +```sql +create temporary view spark_incremental__dbt_tmp as + + select * from analytics.events + + where event_ts >= (select max(event_ts) from {{ this }}) + +; + +insert into table analytics.spark_incremental + select `date_day`, `users` from spark_incremental__dbt_tmp +``` + +### The `insert_overwrite` strategy + +This strategy is most effective when specified alongside a `partition_by` clause in your model config. dbt will run an [atomic `insert overwrite` statement](https://spark.apache.org/docs/latest/sql-ref-syntax-dml-insert-overwrite-table.html) that dynamically replaces all partitions included in your query. Be sure to re-select _all_ of the relevant data for a partition when using this incremental strategy. + +If no `partition_by` is specified, then the `insert_overwrite` strategy will atomically replace all contents of the table, overriding all existing data with only the new records. The column schema of the table remains the same, however. This can be desirable in some limited circumstances since it minimizes downtime while the table contents are overwritten. The operation is comparable to running `truncate` + `insert` on other databases. For atomic replacement of Delta-formatted tables, use the `table` materialization (which runs `create or replace`) instead. 
+ +#### Source Code +```sql +{{ config( + materialized='incremental', + partition_by=['date_day'], + file_format='parquet' +) }} + +/* + Every partition returned by this query will be overwritten + when this model runs +*/ + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + date_day, + count(*) as users + +from events +group by 1 +``` + +#### Run Code + +```sql +create temporary view spark_incremental__dbt_tmp as + + with new_events as ( + + select * from analytics.events + + + where date_day >= date_add(current_date, -1) + + + ) + + select + date_day, + count(*) as users + + from events + group by 1 + +; + +insert overwrite table analytics.spark_incremental + partition (date_day) + select `date_day`, `users` from spark_incremental__dbt_tmp +``` + +Specifying `insert_overwrite` as the incremental strategy is optional since it's the default strategy used when none is specified. + +### The `merge` strategy + +**Compatibility:** +- Hudi : OK +- Delta Lake : OK +- Iceberg : OK +- Lake Formation Governed Tables : On going + +NB: + +- For Glue 3: you have to set up a [Glue connectors](https://docs.aws.amazon.com/glue/latest/ug/connectors-chapter.html). + +- For Glue 4: use the `datalake_formats` option in your profile.yml + +When using a connector be sure that your IAM role has these policies: +``` +{ + "Sid": "access_to_connections", + "Action": [ + "glue:GetConnection", + "glue:GetConnections" + ], + "Resource": [ + "arn:aws:glue:::catalog", + "arn:aws:glue:::connection/*" + ], + "Effect": "Allow" +} +``` +and that the managed policy `AmazonEC2ContainerRegistryReadOnly` is attached. +Be sure that you follow the getting started instructions [here](https://docs.aws.amazon.com/glue/latest/ug/setting-up.html#getting-started-min-privs-connectors). 
+ + +This [blog post](https://aws.amazon.com/blogs/big-data/part-1-integrate-apache-hudi-delta-lake-apache-iceberg-datasets-at-scale-aws-glue-studio-notebook/) also explains how to set up and work with Glue Connectors + +#### Hudi + +**Usage notes:** The `merge` with Hudi incremental strategy requires: +- To add `file_format: hudi` in your table configuration +- To add `datalake_formats` in your profile: `datalake_formats: hudi` + - Alternatively, to add a connection in your profile: `connections: name_of_your_hudi_connector` +- To add Kryo serializer in your Interactive Session Config (in your profile): `conf: spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false` + +dbt will run an [atomic `merge` statement](https://hudi.apache.org/docs/writing_data#spark-datasource-writer) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. If a `unique_key` is specified (recommended), dbt will update old records with values from new records that match the key column. If a `unique_key` is not specified, dbt will forgo match criteria and simply insert all new records (similar to `append` strategy). 
+ +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + conf: spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false + datalake_formats: hudi +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key='user_id', + file_format='hudi', + hudi_options={ + 'hoodie.datasource.write.precombine.field': 'eventtime', + } +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` + +#### Delta + +You can also use Delta Lake to be able to use merge feature on tables. + +**Usage notes:** The `merge` with Delta incremental strategy requires: +- To add `file_format: delta` in your table configuration +- To add a datalake_formats in your profile : `datalake_formats: delta` + - Alternatively, to add a connection in your profile: `connections: name_of_your_delta_connector` +- To add the following config in your Interactive Session Config (in your profile): `conf: "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog` + +**Athena:** Athena is not compatible by default with delta tables, but you can configure the adapter to create Athena tables on top of your delta table. 
To do so, you need to configure the two following options in your profile: +- For Delta Lake 2.1.0 supported natively in Glue 4.0: `extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-2.1.0.jar"` +- For Delta Lake 1.0.0 supported natively in Glue 3.0: `extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-1.0.0.jar"` +- `delta_athena_prefix: "the_prefix_of_your_choice"` +- If your table is partitioned, new partitions are not added automatically; you need to run `MSCK REPAIR TABLE your_delta_table` after each new partition is added + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + datalake_formats: delta + conf: "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" + extra_py_files: "/opt/aws_glue_connectors/selected/datalake/delta-core_2.12-2.1.0.jar" + delta_athena_prefix: "delta" +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key='user_id', + partition_by=['dt'], + file_format='delta' +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen, + current_date() as dt + +from events +group by 1 +``` + +#### Iceberg + +**Usage notes:** The `merge` with Iceberg incremental strategy requires: +- To attach the `AmazonEC2ContainerRegistryReadOnly` managed policy to your execution role. +- To add the following policy to your execution 
role to enable commit locking in a dynamodb table (more info [here](https://iceberg.apache.org/docs/latest/aws/#dynamodb-lock-manager)). Note that the DynamoDB table specified in the resource field of this policy should be the one that is mentioned in your dbt profiles (`--conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable`). By default, this table is named `myGlueLockTable` and is created automatically (with On-Demand Pricing) when running a dbt-glue model with Incremental Materialization and Iceberg file format. If you want to name the table differently or to create your own table without letting Glue do it on your behalf, please provide the `iceberg_glue_commit_lock_table` parameter with your table name (eg. `MyDynamoDbTable`) in your dbt profile. +```yaml +iceberg_glue_commit_lock_table: "MyDynamoDbTable" +``` +- the latest connector for iceberg in AWS marketplace uses Ver 0.14.0 for Glue 3.0, and Ver 1.2.1 for Glue 4.0 where Kryo serialization fails when writing iceberg, use "org.apache.spark.serializer.JavaSerializer" for spark.serializer instead, more info [here](https://github.com/apache/iceberg/pull/546) + +Make sure you update your conf with `--conf spark.sql.catalog.glue_catalog.lock.table=<YourDynamoDBLockTableName>` and that you update the IAM policy below with your correct table name. 
+``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CommitLockTable", + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:BatchGetItem", + "dynamodb:BatchWriteItem", + "dynamodb:ConditionCheckItem", + "dynamodb:PutItem", + "dynamodb:DescribeTable", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:Scan", + "dynamodb:Query", + "dynamodb:UpdateItem" + ], + "Resource": "arn:aws:dynamodb:::table/myGlueLockTable" + } + ] +} +``` +- To add `file_format: Iceberg` in your table configuration +- To add a datalake_formats in your profile : `datalake_formats: iceberg` + - Alternatively, to add connections in your profile: `connections: name_of_your_iceberg_connector` ( + - For Athena version 3: + - The adapter is compatible with the Iceberg Connector from AWS Marketplace with Glue 3.0 as Fulfillment option and 0.14.0 (Oct 11, 2022) as Software version) + - the latest connector for iceberg in AWS marketplace uses Ver 0.14.0 for Glue 3.0, and Ver 1.2.1 for Glue 4.0 where Kryo serialization fails when writing iceberg, use "org.apache.spark.serializer.JavaSerializer" for spark.serializer instead, more info [here](https://github.com/apache/iceberg/pull/546) + - For Athena version 2: The adapter is compatible with the Iceberg Connector from AWS Marketplace with Glue 3.0 as Fulfillment option and 0.12.0-2 (Feb 14, 2022) as Software version) +- To add the following config in your Interactive Session Config (in your profile): +```--conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer + --conf spark.sql.warehouse=s3:// + --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog + --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog + --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO + --conf 
spark.sql.catalog.glue_catalog.lock-impl=org.apache.iceberg.aws.dynamodb.DynamoDbLockManager + --conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +``` + - For Glue 3.0, set `spark.sql.catalog.glue_catalog.lock-impl` to `org.apache.iceberg.aws.glue.DynamoLockManager` instead + +dbt will run an [atomic `merge` statement](https://iceberg.apache.org/docs/latest/spark-writes/) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. You need to provide a `unique_key` to perform the merge operation; otherwise, it will fail. This key must be provided as a Python list and can contain multiple column names to create a composite unique_key. + +##### Notes +- When using a custom_location in Iceberg, avoid a trailing slash. A trailing slash leads to improper handling of the location and to issues when reading the data from query engines like Trino. The issue should be fixed for Iceberg version > 0.13. The related GitHub issue can be found [here](https://github.com/apache/iceberg/issues/4582). +- Iceberg also supports `insert_overwrite` and `append` strategies. +- The `warehouse` conf must be provided, but it's overwritten by the adapter `location` in your profile or `custom_location` in model configuration. +- By default, this materialization has `iceberg_expire_snapshots` set to 'True'; if you need to keep historical, auditable changes, set: `iceberg_expire_snapshots='False'`. +- Currently, due to some dbt internals, the Iceberg catalog used internally when running glue interactive sessions with dbt-glue has a hardcoded name `glue_catalog`. This name is an alias pointing to the AWS Glue Catalog but is specific to each session. If you want to interact with your data in another session without using dbt-glue (from a Glue Studio notebook, for example), you can configure another alias (i.e., another name for the Iceberg Catalog). 
To illustrate this concept, you can set in your configuration file : +``` +--conf spark.sql.catalog.RandomCatalogName=org.apache.iceberg.spark.SparkCatalog +``` +And then run in an AWS Glue Studio Notebook a session with the following config: +``` +--conf spark.sql.catalog.AnotherRandomCatalogName=org.apache.iceberg.spark.SparkCatalog +``` +In both cases, the underlying catalog would be the AWS Glue Catalog, unique in your AWS Account and Region, and you would be able to work with the exact same data. Also make sure that if you change the name of the Glue Catalog Alias, you change it in all the other `--conf` where it's used: +``` + --conf spark.sql.catalog.RandomCatalogName=org.apache.iceberg.spark.SparkCatalog + --conf spark.sql.catalog.RandomCatalogName.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog + ... + --conf spark.sql.catalog.RandomCatalogName.lock-impl=org.apache.iceberg.aws.glue.DynamoLockManager +``` +- A full reference to `table_properties` can be found [here](https://iceberg.apache.org/docs/latest/configuration/). +- Iceberg Tables are natively supported by Athena. Therefore, you can query tables created and operated with dbt-glue adapter from Athena. +- Incremental Materialization with Iceberg file format supports dbt snapshot. You are able to run a dbt snapshot command that queries an Iceberg Table and create a dbt fashioned snapshot of it. 
+ +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + datalake_formats: iceberg + conf: --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions --conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.warehouse=s3://aws-dbt-glue-datalake-1234567890-eu-west-1/dbt_test_project --conf spark.sql.catalog.glue_catalog=org.apache.iceberg.spark.SparkCatalog --conf spark.sql.catalog.glue_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog --conf spark.sql.catalog.glue_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO --conf spark.sql.catalog.glue_catalog.lock-impl=org.apache.iceberg.aws.dynamodb.DynamoDbLockManager --conf spark.sql.catalog.glue_catalog.lock.table=myGlueLockTable --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +``` + +#### Source Code example +```sql +{{ config( + materialized='incremental', + incremental_strategy='merge', + unique_key=['user_id'], + file_format='iceberg', + iceberg_expire_snapshots='False', + partition_by=['status'] + table_properties={'write.target-file-size-bytes': '268435456'} +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` +#### Iceberg Snapshot source code example +```sql + +{% snapshot demosnapshot %} + +{{ + config( + strategy='timestamp', + target_schema='jaffle_db', + updated_at='dt', + file_format='iceberg' +) }} + +select * from {{ ref('customers') }} + +{% 
endsnapshot %} + +``` + +## Monitoring your Glue Interactive Session + +Monitoring is an important part of maintaining the reliability, availability, +and performance of AWS Glue and your other AWS solutions. AWS provides monitoring +tools that you can use to watch AWS Glue, identify the required number of workers +required for your Glue Interactive Session, report when something is wrong and +take action automatically when appropriate. AWS Glue provides Spark UI, +and CloudWatch logs and metrics for monitoring your AWS Glue jobs. +More information on: [Monitoring AWS Glue Spark jobs](https://docs.aws.amazon.com/glue/latest/dg/monitor-spark.html) + +**Usage notes:** Monitoring requires: +- To add the following IAM policy to your IAM role: +``` +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CloudwatchMetrics", + "Effect": "Allow", + "Action": "cloudwatch:PutMetricData", + "Resource": "*", + "Condition": { + "StringEquals": { + "cloudwatch:namespace": "Glue" + } + } + }, + { + "Sid": "CloudwatchLogs", + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "logs:CreateLogStream", + "logs:CreateLogGroup", + "logs:PutLogEvents" + ], + "Resource": [ + "arn:aws:logs:*:*:/aws-glue/*", + "arn:aws:s3:::bucket-to-write-sparkui-logs/*" + ] + } + ] +} +``` + +- To add monitoring parameters in your Interactive Session Config (in your profile). 
+More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "4.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + default_arguments: "--enable-metrics=true, --enable-continuous-cloudwatch-log=true, --enable-continuous-log-filter=true, --enable-spark-ui=true, --spark-event-logs-path=s3://bucket-to-write-sparkui-logs/dbt/" +``` + +If you want to use the Spark UI, you can launch the Spark history server using a +AWS CloudFormation template that hosts the server on an EC2 instance, +or launch locally using Docker. More information on [Launching the Spark history server](https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-history.html#monitor-spark-ui-history-local) + +## Enabling AWS Glue Auto Scaling +Auto Scaling is available since AWS Glue version 3.0 or later. More information +on the following AWS blog post: ["Introducing AWS Glue Auto Scaling: Automatically resize serverless computing resources for lower cost with optimized Apache Spark"](https://aws.amazon.com/blogs/big-data/introducing-aws-glue-auto-scaling-automatically-resize-serverless-computing-resources-for-lower-cost-with-optimized-apache-spark/) + +With Auto Scaling enabled, you will get the following benefits: + +* AWS Glue automatically adds and removes workers from the cluster depending on the parallelism at each stage or microbatch of the job run. + +* It removes the need for you to experiment and decide on the number of workers to assign for your AWS Glue Interactive sessions. 
+ +* Once you choose the maximum number of workers, AWS Glue will choose the right size resources for the workload. + +* You can see how the size of the cluster changes during the Glue Interactive sessions run by looking at CloudWatch metrics. +More information on [Monitoring your Glue Interactive Session](#monitoring-your-glue-interactive-session). + +**Usage notes:** AWS Glue Auto Scaling requires: +- To set your AWS Glue version to 3.0 or later. +- To set the maximum number of workers (if Auto Scaling is enabled, the `workers` +parameter sets the maximum number of workers) +- To set the `--enable-auto-scaling=true` parameter on your Glue Interactive Session Config (in your profile). +More information on [Job parameters used by AWS Glue](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html) + +#### Profile config example +```yaml +test_project: + target: dev + outputs: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "3.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + default_arguments: "--enable-auto-scaling=true" +``` + +## Access Glue catalog in another AWS account +In many cases, you may need to run your dbt jobs to read from another AWS account. + +Review the following link https://repost.aws/knowledge-center/glue-tables-cross-accounts to set up access policies in source and target accounts + +Add the following `"spark.hadoop.hive.metastore.glue.catalogid="` to your conf in the dbt profile. This way, you can have multiple outputs, one for each of the accounts that you have access to.
+ +Note: The access cross-accounts need to be within the same AWS Region +#### Profile config example +```yaml +test_project: + target: dev + outputsAccountB: + dev: + type: glue + query-comment: my comment + role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole + region: eu-west-1 + glue_version: "3.0" + workers: 2 + worker_type: G.1X + schema: "dbt_test_project" + session_provisioning_timeout_in_seconds: 120 + location: "s3://aws-dbt-glue-datalake-1234567890-eu-west-1/" + conf: "--conf hive.metastore.client.factory.class=com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory + --conf spark.hadoop.hive.metastore.glue.catalogid=" +``` + +## Persisting model descriptions + +Relation-level docs persistence is supported since dbt v0.17.0. For more +information on configuring docs persistence, see [the docs](/reference/resource-configs/persist_docs). + +When the `persist_docs` option is configured appropriately, you'll be able to +see model descriptions in the `Comment` field of `describe [table] extended` +or `show table extended in [database] like '*'`. + +## Always `schema`, never `database` + +Apache Spark uses the terms "schema" and "database" interchangeably. dbt understands +`database` to exist at a higher level than `schema`. As such, you should _never_ +use or set `database` as a node config or in the target profile when running dbt-glue. + +If you want to control the schema/database in which dbt will materialize models, +use the `schema` config and `generate_schema_name` macro _only_. +For more information, check the dbt documentation about [custom schemas](https://docs.getdbt.com/docs/build/custom-schemas). + +## AWS Lakeformation integration +The adapter supports AWS Lake Formation tags management enabling you to associate existing tags defined out of dbt-glue to database objects built by dbt-glue (database, table, view, snapshot, incremental models, seeds). 
+ +- You can enable or disable lf-tags management via config, at model and dbt-project level (disabled by default) +- If enabled, lf-tags will be updated on every dbt run. There are table level lf-tags configs and column-level lf-tags configs. +- You can specify that you want to drop existing database, table, and column Lake Formation tags by setting the drop_existing config field to True (False by default, meaning existing tags are kept) +- Please note that if the tag you want to associate with the table does not exist, the dbt-glue execution will throw an error + +The adapter also supports AWS Lake Formation data cell filtering. +- You can enable or disable data-cell filtering via config, at model and dbt-project level (disabled by default) +- If enabled, data_cell_filters will be updated on every dbt run. +- You can specify that you want to drop existing table data-cell filters by setting the drop_existing config field to True (False by default, meaning existing filters are kept) +- You can leverage the `excluded_column_names` **OR** `column_names` config fields to perform column-level security as well. **Please note that you can use one or the other but not both**. +- By default, if you don't specify `column_names` or `excluded_column_names`, dbt-glue does not perform column-level filtering and lets the principal access all the columns.
+ +The below configuration let the specified principal (lf-data-scientist IAM user) access rows that have a customer_lifetime_value > 15 and all the columns specified ('customer_id', 'first_order', 'most_recent_order', 'number_of_orders') + +```sql +lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'column_names': ['customer_id', 'first_order', 'most_recent_order', 'number_of_orders'] + } + }, + } + } +``` +The below configuration let the specified principal (lf-data-scientist IAM user) access rows that have a customer_lifetime_value > 15 and all the columns *except* the one specified ('first_name') + +```sql +lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'excluded_column_names': ['first_name'] + } + }, + } + } +``` + +See below some examples of how you can integrate LF Tags management and data cell filtering to your configurations : + +#### At model level +This way of defining your Lakeformation rules is appropriate if you want to handle the tagging and filtering policy at object level. Remember that it overrides any configuration defined at dbt-project level. 
+ +```sql +{{ config( + materialized='incremental', + unique_key="customer_id", + incremental_strategy='append', + lf_tags_config={ + 'enabled': true, + 'drop_existing' : False, + 'tags_database': + { + 'name_of_my_db_tag': 'value_of_my_db_tag' + }, + 'tags_table': + { + 'name_of_my_table_tag': 'value_of_my_table_tag' + }, + 'tags_columns': { + 'name_of_my_lf_tag': { + 'value_of_my_tag': ['customer_id', 'customer_lifetime_value', 'dt'] + }}}, + lf_grants={ + 'data_cell_filters': { + 'enabled': True, + 'drop_existing' : True, + 'filters': { + 'the_name_of_my_filter': { + 'row_filter': 'customer_lifetime_value>15', + 'principals': ['arn:aws:iam::123456789:user/lf-data-scientist'], + 'excluded_column_names': ['first_name'] + } + }, + } + } +) }} + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order, + customer_orders.most_recent_order, + customer_orders.number_of_orders, + customer_payments.total_amount as customer_lifetime_value, + current_date() as dt + + from customers + + left join customer_orders using (customer_id) + + left join customer_payments using (customer_id) + +``` + +#### At dbt-project level +This way you can specify tags and data filtering policy for a particular path in your dbt project (eg. models, seeds, models/model_group1, etc.) +This is especially useful for seeds, for which you can't define configuration in the file directly. + +```yml +seeds: + +lf_tags_config: + enabled: true + tags_table: + name_of_my_table_tag: 'value_of_my_table_tag' + tags_database: + name_of_my_database_tag: 'value_of_my_database_tag' +models: + +lf_tags_config: + enabled: true + drop_existing: True + tags_database: + name_of_my_database_tag: 'value_of_my_database_tag' + tags_table: + name_of_my_table_tag: 'value_of_my_table_tag' +``` + +## Tests + +To perform a functional test: +1. Install dev requirements: +```bash +$ pip3 install -r dev-requirements.txt +``` + +2. 
Install dev locally +```bash +$ python3 setup.py build && python3 setup.py install_lib +``` + +3. Export variables +```bash +$ export DBT_S3_LOCATION=s3://mybucket/myprefix +$ export DBT_ROLE_ARN=arn:aws:iam::1234567890:role/GlueInteractiveSessionRole +``` + +4. Run the test +```bash +$ python3 -m pytest tests/functional +``` + +For more information, check the dbt documentation about [testing a new adapter](https://docs.getdbt.com/docs/contributing/testing-a-new-adapter). + +## Caveats + +### Supported Functionality + +Most dbt Core functionality is supported, but some features are only available with Apache Hudi. + +Apache Hudi-only features: +1. Incremental model updates by `unique_key` instead of `partition_by` (see [`merge` strategy](/reference/resource-configs/glue-configs#the-merge-strategy)) + + +Some dbt features, available on the core adapters, are not yet supported on Glue: +1. [Persisting](/reference/resource-configs/persist_docs) column-level descriptions as database comments +2. [Snapshots](/docs/build/snapshots) diff --git a/website/docs/reference/warehouse-setups/greenplum-setup.md b/website/docs/docs/core/connect-data-platform/greenplum-setup.md similarity index 95% rename from website/docs/reference/warehouse-setups/greenplum-setup.md rename to website/docs/docs/core/connect-data-platform/greenplum-setup.md index 2d708bb3d5f..06ada19a1e9 100644 --- a/website/docs/reference/warehouse-setups/greenplum-setup.md +++ b/website/docs/docs/core/connect-data-platform/greenplum-setup.md @@ -1,5 +1,6 @@ --- title: "Greenplum setup" +description: "Read this guide to learn about the Greenplum warehouse setup in dbt." id: "greenplum-setup" meta: maintained_by: Community @@ -12,7 +13,7 @@ meta: slack_channel_name: 'n/a' slack_channel_link: 'https://www.getdbt.com/community' platform_name: 'Greenplum' - config_page: 'greenplum-configs' + config_page: '/reference/resource-configs/greenplum-configs' ---

    Overview of {frontMatter.meta.pypi_package}

    diff --git a/website/docs/reference/warehouse-setups/hive-setup.md b/website/docs/docs/core/connect-data-platform/hive-setup.md similarity index 98% rename from website/docs/reference/warehouse-setups/hive-setup.md rename to website/docs/docs/core/connect-data-platform/hive-setup.md index e798bf37fe2..61a929c58da 100644 --- a/website/docs/reference/warehouse-setups/hive-setup.md +++ b/website/docs/docs/core/connect-data-platform/hive-setup.md @@ -1,5 +1,6 @@ --- title: "Apache Hive setup" +description: "Read this guide to learn about the Apache Hive warehouse setup in dbt." id: "hive-setup" meta: maintained_by: Cloudera @@ -12,7 +13,7 @@ meta: slack_channel_name: '#db-hive' slack_channel_link: 'https://getdbt.slack.com/archives/C0401DTNSKW' platform_name: 'Hive' - config_page: 'hive-configs' + config_page: '/reference/resource-configs/hive-configs' ---

    Overview of {frontMatter.meta.pypi_package}

    diff --git a/website/docs/reference/warehouse-setups/ibmdb2-setup.md b/website/docs/docs/core/connect-data-platform/ibmdb2-setup.md similarity index 97% rename from website/docs/reference/warehouse-setups/ibmdb2-setup.md rename to website/docs/docs/core/connect-data-platform/ibmdb2-setup.md index c601b1cb763..cb6c7459418 100644 --- a/website/docs/reference/warehouse-setups/ibmdb2-setup.md +++ b/website/docs/docs/core/connect-data-platform/ibmdb2-setup.md @@ -1,5 +1,6 @@ --- title: "IBM DB2 setup" +description: "Read this guide to learn about the IBM DB2 warehouse setup in dbt." id: "ibmdb2-setup" meta: maintained_by: Community @@ -12,7 +13,7 @@ meta: slack_channel_name: 'n/a' slack_channel_link: 'https://www.getdbt.com/community' platform_name: 'IBM DB2' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Community plugin diff --git a/website/docs/reference/warehouse-setups/impala-setup.md b/website/docs/docs/core/connect-data-platform/impala-setup.md similarity index 98% rename from website/docs/reference/warehouse-setups/impala-setup.md rename to website/docs/docs/core/connect-data-platform/impala-setup.md index 121eca94310..0a0f1b955a1 100644 --- a/website/docs/reference/warehouse-setups/impala-setup.md +++ b/website/docs/docs/core/connect-data-platform/impala-setup.md @@ -1,5 +1,6 @@ --- title: "Apache Impala setup" +description: "Read this guide to learn about the Apache Impala warehouse setup in dbt." id: "impala-setup" meta: maintained_by: Cloudera @@ -12,7 +13,7 @@ meta: slack_channel_name: '#db-impala' slack_channel_link: 'https://getdbt.slack.com/archives/C01PWAH41A5' platform_name: 'Impala' - config_page: 'impala-configs' + config_page: '/reference/resource-configs/impala-configs' ---

    Overview of {frontMatter.meta.pypi_package}

    diff --git a/website/docs/docs/core/connect-data-platform/infer-setup.md b/website/docs/docs/core/connect-data-platform/infer-setup.md new file mode 100644 index 00000000000..430c5e47f85 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/infer-setup.md @@ -0,0 +1,149 @@ +--- +title: "Infer setup" +description: "Read this guide to learn about the Infer warehouse setup in dbt." +id: "infer-setup" +meta: + maintained_by: Infer + authors: Erik Mathiesen-Dreyfus, Ryan Garland + github_repo: 'inferlabs/dbt-infer' + pypi_package: 'dbt-infer' + min_core_version: 'v1.2.0' + cloud_support: Not Supported + slack_channel_name: n/a + slack_channel_link: + platform_name: 'Infer' + config_page: '/reference/resource-configs/no-configs' + min_supported_version: n/a +--- + +

    Overview of {frontMatter.meta.pypi_package}

    + +
      +
    • Maintained by: {frontMatter.meta.maintained_by}
    • +
    • Authors: {frontMatter.meta.authors}
    • +
    • GitHub repo: {frontMatter.meta.github_repo}
    • +
    • PyPI package: {frontMatter.meta.pypi_package}
    • +
    • Slack channel: {frontMatter.meta.slack_channel_name}
    • +
    • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
    • +
    • dbt Cloud support: {frontMatter.meta.cloud_support}
    • +
    • Minimum data platform version: {frontMatter.meta.min_supported_version}
    • +
    + + +

    Installing {frontMatter.meta.pypi_package}

    + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

    Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

    + +

    Configuring {frontMatter.meta.pypi_package}

    + +

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    + + +## Connecting to Infer with **dbt-infer** + +Infer allows you to perform advanced ML Analytics within SQL as if native to your data warehouse. +To do this Infer uses a variant called SQL-inf, which defines a set of primitive ML commands from which +you can build advanced analysis for any business use case. +Read more about SQL-inf and Infer in the [Infer documentation](https://docs.getinfer.io/). + +The `dbt-infer` package allows you to use SQL-inf easily within your dbt models. +You can read more about the `dbt-infer` package itself and how it connects to Infer in the [dbt-infer documentation](https://dbt.getinfer.io/). + +Before using SQL-inf in your dbt models you need to set up an Infer account and generate an API-key for the connection. +You can read how to do that in the [Getting Started Guide](https://dbt.getinfer.io/docs/getting_started#sign-up-to-infer). + +The profile configuration in `profiles.yml` for `dbt-infer` should look something like this: + + + +```yaml +: + target: + outputs: + : + type: infer + url: "" + username: "" + apikey: "" + data_config: + [configuration for your underlying data warehouse] +``` + + + +Note that you need to also have installed the adapter package for your underlying data warehouse. +For example, if your data warehouse is BigQuery then you need to also have installed the appropriate `dbt-bigquery` package. +The configuration of this goes into the `data_config` field. + +### Description of Infer Profile Fields + +| Field | Required | Description | +|------------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------| +| `type` | Yes | Must be set to `infer`. This must be included either in `profiles.yml` or in the `dbt_project.yml` file. | +| `url` | Yes | The host name of the Infer server to connect to. Typically this is `https://app.getinfer.io`.
| +| `username` | Yes | Your Infer username - the one you use to login. | +| `apikey` | Yes | Your Infer api key. | +| `data_config` | Yes | The configuration for your underlying data warehouse. The format of this follows the format of the configuration for your data warehouse adapter. | + + +### Example of Infer configuration + +To illustrate the above descriptions, here is an example of what a `dbt-infer` configuration might look like. +In this case the underlying data warehouse is BigQuery, which we configure the adapter for inside the `data_config` field. + +```yaml +infer_bigquery: + apikey: 1234567890abcdef + username: my_name@example.com + url: https://app.getinfer.io + type: infer + data_config: + dataset: my_dataset + job_execution_timeout_seconds: 300 + job_retries: 1 + keyfile: bq-user-creds.json + location: EU + method: service-account + priority: interactive + project: my-bigquery-project + threads: 1 + type: bigquery +``` + +## Usage + +You do not need to change anything in your existing DBT models when switching to use SQL-inf – +they will all work the same as before – but you now have the ability to use SQL-inf commands +as native SQL functions. + +Infer supports a number of SQL-inf commands, including +`PREDICT`, `EXPLAIN`, `CLUSTER`, `SIMILAR_TO`, `TOPICS`, `SENTIMENT`. +You can read more about SQL-inf and the commands it supports in the [SQL-inf Reference Guide](https://docs.getinfer.io/docs/reference). + +To get you started we will give a brief example here of what such a model might look like. +You can find other more complex examples on the [dbt-infer examples page](https://dbt.getinfer.io/docs/examples). + +In our simple example, we will show how to use a previous model 'user_features' to predict churn +by predicting the column `has_churned`. 
+ +```sql title="predict_user_churn.sql" +{{ + config( + materialized = "table" + ) +}} + +with predict_user_churn_input as ( + select * from {{ ref('user_features') }} +) + +SELECT * FROM predict_user_churn_input PREDICT(has_churned, ignore=user_id) +``` + +Note that we exclude `user_id` from the prediction. +This is because we think that `user_id` should not influence our prediction of churn, so we remove it. +We also use the convention of pulling together the inputs for our prediction in a CTE, named `predict_user_churn_input`. diff --git a/website/docs/reference/warehouse-setups/iomete-setup.md b/website/docs/docs/core/connect-data-platform/iomete-setup.md similarity index 97% rename from website/docs/reference/warehouse-setups/iomete-setup.md rename to website/docs/docs/core/connect-data-platform/iomete-setup.md index bbc02a56683..bc015141c85 100644 --- a/website/docs/reference/warehouse-setups/iomete-setup.md +++ b/website/docs/docs/core/connect-data-platform/iomete-setup.md @@ -1,5 +1,6 @@ --- title: "iomete setup" +description: "Read this guide to learn about the iomete warehouse setup in dbt." id: "iomete-setup" meta: maintained_by: iomete @@ -12,7 +13,7 @@ meta: slack_channel_name: '##db-iomete' slack_channel_link: 'https://getdbt.slack.com/archives/C03JFG22EP9' platform_name: 'iomete' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' ---

    Overview of {frontMatter.meta.pypi_package}

    diff --git a/website/docs/reference/warehouse-setups/layer-setup.md b/website/docs/docs/core/connect-data-platform/layer-setup.md similarity index 98% rename from website/docs/reference/warehouse-setups/layer-setup.md rename to website/docs/docs/core/connect-data-platform/layer-setup.md index f1a9b8b8940..f065c0c7313 100644 --- a/website/docs/reference/warehouse-setups/layer-setup.md +++ b/website/docs/docs/core/connect-data-platform/layer-setup.md @@ -1,5 +1,6 @@ --- title: "Layer setup" +description: "Read this guide to learn about the Layer warehouse setup in dbt." id: "layer-setup" meta: maintained_by: Layer @@ -12,7 +13,7 @@ meta: slack_channel_name: '#tools-layer' slack_channel_link: 'https://getdbt.slack.com/archives/C03STA39TFE' platform_name: 'Layer' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- diff --git a/website/docs/reference/warehouse-setups/materialize-setup.md b/website/docs/docs/core/connect-data-platform/materialize-setup.md similarity index 80% rename from website/docs/reference/warehouse-setups/materialize-setup.md rename to website/docs/docs/core/connect-data-platform/materialize-setup.md index 684f7174a9f..c8777c29490 100644 --- a/website/docs/reference/warehouse-setups/materialize-setup.md +++ b/website/docs/docs/core/connect-data-platform/materialize-setup.md @@ -1,17 +1,19 @@ --- title: "Materialize setup" +description: "Read this guide to learn about the Materialize warehouse setup in dbt." id: "materialize-setup" meta: maintained_by: Materialize Inc. 
+ pypi_package: 'dbt-materialize' authors: 'Materialize team' github_repo: 'MaterializeInc/materialize/blob/main/misc/dbt-materialize' - min_core_version: 'v0.18.0' + min_core_version: 'v0.18.1' + min_supported_version: 'v0.28.0' cloud_support: Not Supported - min_supported_version: 'n/a' slack_channel_name: '#db-materialize' slack_channel_link: 'https://getdbt.slack.com/archives/C01PWAH41A5' platform_name: 'Materialize' - config_page: 'materialize-configs' + config_page: '/reference/resource-configs/materialize-configs' --- :::info Vendor-supported plugin @@ -43,7 +45,7 @@ pip is the easiest way to install the adapter:

    Configuring {frontMatter.meta.pypi_package}

    -

    For {frontMatter.meta.platform_name}-specifc configuration please refer to {frontMatter.meta.platform_name} Configuration

    +

    For {frontMatter.meta.platform_name}-specific configuration, please refer to {frontMatter.meta.platform_name} Configuration.

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    @@ -54,7 +56,7 @@ Once you have set up a [Materialize account](https://materialize.com/register/), ```yaml -dbt-materialize: +materialize: target: dev outputs: dev: @@ -67,9 +69,9 @@ dbt-materialize: cluster: [cluster] # default 'default' schema: [dbt schema] sslmode: require - keepalives_idle: 0 # default 0, indicating the system default - connect_timeout: 10 # default 10 seconds - retries: 1 # default 1 retry on error/timeout when opening connections + keepalives_idle: 0 # default: 0, indicating the system default + connect_timeout: 10 # default: 10 seconds + retries: 1 # default: 1, retry on error/timeout when opening connections ``` @@ -106,16 +108,16 @@ Type | Supported? | Details ### Indexes -Materialized views (`materializedview`), views (`view`) and sources (`source`) may have a list of [`indexes`](resource-configs/materialize-configs/indexes) defined. +Materialized views (`materializedview`), views (`view`) and sources (`source`) may have a list of [`indexes`](/reference/resource-configs/materialize-configs#indexes) defined. ### Seeds -Running [`dbt seed`](commands/seed) will create a static materialized from a CSV file. You will not be able to add to or update this view after it has been created. +Running [`dbt seed`](/reference/commands/seed) will create a static materialized from a CSV file. You will not be able to add to or update this view after it has been created. ### Tests -Running [`dbt test`](commands/test) with the optional `--store-failures` flag or [`store_failures` config](resource-configs/store_failures) will create a materialized view for each test you've chosen to store. This view is a continuously updating representation of failures. +Running [`dbt test`](/reference/commands/test) with the optional `--store-failures` flag or [`store_failures` config](/reference/resource-configs/store_failures) will create a materialized view for each configured test that can keep track of failures over time. 
## Resources -- [dbt and Materialize guide](https://materialize.com/docs/guides/dbt/) \ No newline at end of file +- [dbt and Materialize guide](https://materialize.com/docs/guides/dbt/) diff --git a/website/docs/reference/warehouse-setups/mindsdb-setup.md b/website/docs/docs/core/connect-data-platform/mindsdb-setup.md similarity index 98% rename from website/docs/reference/warehouse-setups/mindsdb-setup.md rename to website/docs/docs/core/connect-data-platform/mindsdb-setup.md index 9c5a691aa80..e6b8c5decaa 100644 --- a/website/docs/reference/warehouse-setups/mindsdb-setup.md +++ b/website/docs/docs/core/connect-data-platform/mindsdb-setup.md @@ -12,7 +12,7 @@ meta: slack_channel_name: 'n/a' slack_channel_link: 'https://www.getdbt.com/community' platform_name: 'MindsDB' - config_page: 'mindsdb-configs' + config_page: '/reference/resource-configs/mindsdb-configs' --- :::info Vendor-supported plugin @@ -48,7 +48,6 @@ pip is the easiest way to install the adapter:

    For {frontMatter.meta.platform_name}-specifc configuration please refer to {frontMatter.meta.platform_name} Configuration

    For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

    -s ## Configurations diff --git a/website/docs/reference/warehouse-setups/mssql-setup.md b/website/docs/docs/core/connect-data-platform/mssql-setup.md similarity index 65% rename from website/docs/reference/warehouse-setups/mssql-setup.md rename to website/docs/docs/core/connect-data-platform/mssql-setup.md index 6d786d9349b..5efcc454823 100644 --- a/website/docs/reference/warehouse-setups/mssql-setup.md +++ b/website/docs/docs/core/connect-data-platform/mssql-setup.md @@ -1,5 +1,6 @@ --- title: "Microsoft SQL Server setup" +description: "Read this guide to learn about the Microsoft SQL Server warehouse setup in dbt." id: "mssql-setup" meta: maintained_by: Community @@ -12,7 +13,7 @@ meta: slack_channel_name: '#db-sqlserver' slack_channel_link: 'https://getdbt.slack.com/archives/CMRMDDQ9W' platform_name: 'SQL Server' - config_page: 'mssql-configs' + config_page: '/reference/resource-configs/mssql-configs' --- :::info Community plugin @@ -65,27 +66,39 @@ sudo apt install unixodbc-dev Download and install the [Microsoft ODBC Driver 18 for SQL Server](https://docs.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver15). If you already have ODBC Driver 17 installed, then that one will work as well. -The adapter is tested with SQL Server 2017, SQL Server 2019, SQL Server 2022 and Azure SQL Database. These versions are tested with Microsoft ODBC Driver 17 and Microsoft ODBC Driver 18. +#### Supported configurations + +* The adapter is tested with SQL Server 2017, SQL Server 2019, SQL Server 2022 and Azure SQL Database. +* We test all combinations with Microsoft ODBC Driver 17 and Microsoft ODBC Driver 18. +* The collations we run our tests on are `SQL_Latin1_General_CP1_CI_AS` and `SQL_Latin1_General_CP1_CS_AS`. + +The adapter support is not limited to the matrix of the above configurations. 
If you notice an issue with any other configuration, let us know by opening an issue on [GitHub](https://github.com/dbt-msft/dbt-sqlserver). ## Authentication methods & profile configuration ### Common configuration -For all the authentication methods below, the following configuration options can be set in your `profiles.yml` file: - -* `driver`: The ODBC driver to use. E.g. `ODBC Driver 18 for SQL Server` -* `server`: The server hostname. E.g. `localhost` -* `port`: The server port. E.g. `1433` -* `database`: The database name. -* `schema`: The schema name. E.g. `dbo` -* `retries`: The number of automatic times to retry a query before failing. Defaults to `1`. Note that queries with syntax errors will not be retried. This setting can be used to overcome intermittent network issues. -* `encrypt`: Whether to encrypt the connection to the server. Defaults to `true`. Read more about encryption [below](#connection-encryption). -* `trust_cert`: Whether to trust the server certificate. Defaults to `false`. Read more about encryption [below](#connection-encryption). +For all the authentication methods, refer to the following configuration options that can be set in your `profiles.yml` file. +A complete reference of all options can be found [at the end of this page](#reference-of-all-connection-options). + +| Configuration option | Description | Type | Example | +| --------------------- | ---- | ---- | ------- | +| `driver` | The ODBC driver to use | Required | `ODBC Driver 18 for SQL Server` | +| `server` | The server hostname | Required | `localhost` | +| `port` | The server port | Required | `1433` | +| `database` | The database name | Required | Not applicable | +| `schema` | The schema name | Required | `dbo` | +| `retries` | The number of automatic times to retry a query before failing. Defaults to `1`. Queries with syntax errors will not be retried. This setting can be used to overcome intermittent network issues. 
| Optional | Not applicable | +| `login_timeout` | The number of seconds used to establish a connection before failing. Defaults to `0`, which means that the timeout is disabled or uses the default system settings. | Optional | Not applicable | +| `query_timeout` | The number of seconds used to wait for a query before failing. Defaults to `0`, which means that the timeout is disabled or uses the default system settings. | Optional | Not applicable | +| `schema_authorization` | Optionally set this to the principal who should own the schemas created by dbt. [Read more about schema authorization](#schema-authorization). | Optional | Not applicable | +| `encrypt` | Whether to encrypt the connection to the server. Defaults to `true`. Read more about [connection encryption](#connection-encryption). | Optional | Not applicable | +| `trust_cert` | Whether to trust the server certificate. Defaults to `false`. Read more about [connection encryption](#connection-encryption).| Optional | Not applicable | ### Connection encryption Microsoft made several changes in the release of ODBC Driver 18 that affects how connection encryption is configured. -To accommodate these changes, starting in dbt-sqlserver 1.2.0 or newer the default vallues of `encrypt` and `trust_cert` have changed. +To accommodate these changes, starting in dbt-sqlserver 1.2.0 or newer the default values of `encrypt` and `trust_cert` have changed. Both of these settings will now **always** be included in the connection string to the server, regardless if you've left them out of your profile configuration or not. * The default value of `encrypt` is `true`, meaning that connections are encrypted by default. @@ -426,25 +439,39 @@ In Azure SQL, you can sign in using AAD authentication, but to be able to grant Note that principals will not be deleted automatically when they are removed from the `grants` block. +### Schema authorization + +You can optionally set the principal who should own all schemas created by dbt. 
This is then used in the `CREATE SCHEMA` statement like so: + +```sql +CREATE SCHEMA [schema_name] AUTHORIZATION [schema_authorization] +``` + +A common use case is to use this when you are authenticating with a principal who has permissions based on a group, such as an AAD group. When that principal creates a schema, the server will first try to create an individual login for this principal and then link the schema to that principal. If you would be using Azure AD in this case, +then this would fail since Azure SQL can't create logins for individuals part of an AD group automatically. + ### Reference of all connection options -| configuration option | description | required | default value | -|----------------------|-------------------------------------------------------------------------------------------------------------------------------------------------|--------------------|---------------| -| `driver` | The ODBC driver to use. | :white_check_mark: | | -| `host` | The hostname of the database server. | :white_check_mark: | | -| `port` | The port of the database server. | | `1433` | -| `database` | The name of the database to connect to. | :white_check_mark: | | -| `schema` | The schema to use. | :white_check_mark: | | -| `authentication` | The authentication method to use. This is not required for Windows authentication. | | `'sql'` | | | | -| `UID` | Username used to authenticate. This can be left out depending on the authentication method. | | | -| `PWD` | Password used to authenticate. This can be left out depending on the authentication method. | | | -| `windows_login` | Set this to `true` to use Windows authentication. This is only available for SQL Server. | | | -| `tenant_id` | The tenant ID of the Azure Active Directory instance. This is only used when connecting to Azure SQL with a service principal. | | | -| `client_id` | The client ID of the Azure Active Directory service principal. 
This is only used when connecting to Azure SQL with an AAD service principal. | | | -| `client_secret` | The client secret of the Azure Active Directory service principal. This is only used when connecting to Azure SQL with an AAD service principal. | | | -| `encrypt` | Set this to `false` to disable the use of encryption. See [above](#connection-encryption). | | `true` | -| `trust_cert` | Set this to `true` to trust the server certificate. See [above](#connection-encryption). | | `false` | -| `retries` | The number of times to retry a failed connection. | | `1` | +| Configuration option | Description | Required | Default value | +| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ | ------------- | +| `driver` | The ODBC driver to use. | :white_check_mark: | | +| `host` | The hostname of the database server. | :white_check_mark: | | +| `port` | The port of the database server. | | `1433` | +| `database` | The name of the database to connect to. | :white_check_mark: | | +| `schema` | The schema to use. | :white_check_mark: | | +| `authentication` | The authentication method to use. This is not required for Windows authentication. | | `'sql'` | +| `UID` | Username used to authenticate. This can be left out depending on the authentication method. | | | +| `PWD` | Password used to authenticate. This can be left out depending on the authentication method. | | | +| `windows_login` | Set this to `true` to use Windows authentication. This is only available for SQL Server. | | | +| `tenant_id` | The tenant ID of the Azure Active Directory instance. This is only used when connecting to Azure SQL with a service principal. | | | +| `client_id` | The client ID of the Azure Active Directory service principal. This is only used when connecting to Azure SQL with an AAD service principal. 
| | | +| `client_secret` | The client secret of the Azure Active Directory service principal. This is only used when connecting to Azure SQL with an AAD service principal. | | | +| `encrypt` | Set this to `false` to disable the use of encryption. See [above](#connection-encryption). | | `true` | +| `trust_cert` | Set this to `true` to trust the server certificate. See [above](#connection-encryption). | | `false` | +| `retries` | The number of times to retry a failed connection. | | `1` | +| `schema_authorization` | Optionally set this to the principal who should own the schemas created by dbt. [Details above](#schema-authorization). | | | +| `login_timeout` | The amount of seconds to wait until a response from the server is received when establishing a connection. `0` means that the timeout is disabled. | | `0` | +| `query_timeout` | The amount of seconds to wait until a response from the server is received when executing a query. `0` means that the timeout is disabled. | | `0` | Valid values for `authentication`: diff --git a/website/docs/reference/warehouse-setups/mysql-setup.md b/website/docs/docs/core/connect-data-platform/mysql-setup.md similarity index 98% rename from website/docs/reference/warehouse-setups/mysql-setup.md rename to website/docs/docs/core/connect-data-platform/mysql-setup.md index c91e48c30ec..1df6e205272 100644 --- a/website/docs/reference/warehouse-setups/mysql-setup.md +++ b/website/docs/docs/core/connect-data-platform/mysql-setup.md @@ -1,5 +1,6 @@ --- title: "MySQL setup" +description: "Read this guide to learn about the MySQL warehouse setup in dbt." 
id: "mysql-setup" meta: maintained_by: Community @@ -12,7 +13,7 @@ meta: slack_channel_name: '#db-mysql-family' slack_channel_link: 'https://getdbt.slack.com/archives/C03BK0SHC64' platform_name: 'MySQL' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Community plugin diff --git a/website/docs/reference/warehouse-setups/oracle-setup.md b/website/docs/docs/core/connect-data-platform/oracle-setup.md similarity index 73% rename from website/docs/reference/warehouse-setups/oracle-setup.md rename to website/docs/docs/core/connect-data-platform/oracle-setup.md index dfd9ad07207..b1195fbd0a0 100644 --- a/website/docs/reference/warehouse-setups/oracle-setup.md +++ b/website/docs/docs/core/connect-data-platform/oracle-setup.md @@ -1,5 +1,6 @@ --- title: "Oracle setup" +description: "Read this guide to learn about the Oracle warehouse setup in dbt." id: "oracle-setup" meta: maintained_by: Oracle @@ -8,11 +9,11 @@ meta: pypi_package: 'dbt-oracle' min_core_version: 'v1.2.1' cloud_support: Not Supported - min_supported_version: 'SQlite Version 3.0' + min_supported_version: 'Oracle 12c and higher' slack_channel_name: '#db-oracle' slack_channel_link: 'https://getdbt.slack.com/archives/C01PWH4TXLY' platform_name: 'Oracle' - config_page: 'no-configs' + config_page: '/reference/resource-configs/oracle-configs' ---

    Overview of {frontMatter.meta.pypi_package}

    @@ -48,17 +49,17 @@ pip is the easiest way to install the adapter: ### Configure the Python driver mode -:::info +:::info [python-oracledb](https://oracle.github.io/python-oracledb/) is the renamed, major release of Oracle's popular cx_Oracle interface ::: -[python-oracledb](https://oracle.github.io/python-oracledb/) makes it optional to install the Oracle Client libraries. +[python-oracledb](https://oracle.github.io/python-oracledb/) makes it optional to install the Oracle Client libraries. This driver supports 2 modes 1. **Thin mode (preferred) ** : Python process directly connects to the Oracle database. This mode does not need the Oracle Client libraries 2. **Thick mode** : Python process links with the Oracle Client libraries. Some advanced Oracle database functionalities (for e.g. Advanced Queuing and Scrollable cursors) are currently available via Oracle Client libraries -It is highly recommended to use the **thin** mode as it vastly simplifies installation. You can configure the driver mode using the environment variable `ORA_PYTHON_DRIVER_TYPE` +You can configure the driver mode using the environment variable `ORA_PYTHON_DRIVER_TYPE`. Use the **thin** mode as it vastly simplifies installation. | Driver Mode | Oracle Client libraries required? | Configuration | |------------------------|-----------------------------------| ------------- | @@ -66,7 +67,11 @@ It is highly recommended to use the **thin** mode as it vastly simplifies instal | Thick | Yes | `ORA_PYTHON_DRIVER_TYPE=thick` | | cx_oracle (old driver) | Yes | `ORA_PYTHON_DRIVER_TYPE=cx` | -The default value of `ORA_PYTHON_DRIVER_TYPE` is `cx`. This might change in the future as more users migrate towards the new python driver. 
+The default value of `ORA_PYTHON_DRIVER_TYPE` is `cx` + +:::warning Deprecation Warning +Default value of `ORA_PYTHON_DRIVER_TYPE` will change to `thin` in future release of dbt-oracle because `cx_oracle` is deprecated +::: /etc/ld.so.conf.d/oracle-instantclient.conf" sudo ldconfig @@ -148,7 +153,7 @@ Oracle client libraries versions 21, 19, 18, 12, and 11.2 are supported where av :::info Windows 7 users Note that Oracle Client versions 21c and 19c are not supported on Windows 7. -::: +::: 2. Unzip the package into a directory that is accessible to your application. For example unzip `instantclient-basic-windows.x64-19.11.0.0.0dbru.zip` to `C:\oracle\instantclient_19_11`. @@ -160,7 +165,7 @@ Note that Oracle Client versions 21c and 19c are not supported on Windows 7. 5. For Instant Client 11.2 install [VS 2005 64-bit](https://docs.microsoft.com/en-US/cpp/windows/latest-supported-vc-redist?view=msvc-170#visual-studio-2005-vc-80-sp1-no-longer-supported) 4. Add the Oracle Instant Client directory to the `PATH` environment variable.The directory must occur in `PATH` before any other Oracle directories. Restart any open command prompt windows. - + ```bash SET PATH=C:\oracle\instantclient_19_9;%PATH% ``` @@ -169,36 +174,36 @@ Note that Oracle Client versions 21c and 19c are not supported on Windows 7. -1. Download the instant client DMG package - +1. Download the instant client DMG package + ```bash cd $HOME/Downloads curl -O https://download.oracle.com/otn_software/mac/instantclient/198000/instantclient-basic-macos.x64-19.8.0.0.0dbru.dmg ``` 2. Mount the instant client DMG package - + ```bash hdiutil mount instantclient-basic-macos.x64-19.8.0.0.0dbru.dmg ``` 3. Run the install script in the mounted package - + ```bash /Volumes/instantclient-basic-macos.x64-19.8.0.0.0dbru/install_ic.sh ``` 4. Unmount the package - + ```bash hdiutil unmount /Volumes/instantclient-basic-macos.x64-19.8.0.0.0dbru ``` - + 5. 
The Instant Client directory will be `$HOME/Downloads/instantclient_19_8`. You could move it to some place convenient. 6. Add links to `~/lib` or `/usr/local/lib` to enable dbt to find the libraries. - + ```bash mkdir ~/lib ln -s ~/instantclient_19_8/libclntsh.dylib ~/lib/ @@ -211,9 +216,9 @@ Note that Oracle Client versions 21c and 19c are not supported on Windows 7. -## Configure wallet for Oracle Autonomous Database in Cloud +## Configure wallet for Oracle Autonomous Database (ADB-S) in Cloud -dbt can connect to Oracle Autonomous Database (ADB) in Oracle Cloud using either TLS (Transport Layer Security) or mutual TLS (mTLS). TLS and mTLS provide enhanced security for authentication and encryption. +dbt can connect to Oracle Autonomous Database (ADB-S) in Oracle Cloud using either TLS (Transport Layer Security) or mutual TLS (mTLS). TLS and mTLS provide enhanced security for authentication and encryption. A database username and password is still required for dbt connections which can be configured as explained in the next section [Connecting to Oracle Database](#connecting-to-oracle-database). -With TLS, dbt can connect to Oracle ADB without using a wallet. Both Thin and Thick modes of the python-oracledb driver support TLS. +With TLS, dbt can connect to Oracle ADB without using a wallet. Both Thin and Thick modes of the python-oracledb driver support TLS. :::info In Thick mode, dbt can connect through TLS only when using Oracle Client library versions 19.14 (or later) or 21.5 (or later). @@ -236,11 +241,11 @@ Refer to Oracle documentation to [connect to an ADB instance using TLS authentic -For mutual TLS connections, a wallet needs be downloaded from the OCI console and the python driver needs to be configured to use it. +For mutual TLS connections, a wallet needs be downloaded from the OCI console and the python driver needs to be configured to use it. 
#### Install the Wallet and Network Configuration Files -From the Oracle Cloud console for the database, download the wallet zip file using the `DB Connection` button. The zip contains the wallet and network configuration files. +From the Oracle Cloud console for the database, download the wallet zip file using the `DB Connection` button. The zip contains the wallet and network configuration files. :::warning Note Keep wallet files in a secure location and share them only with authorized users. @@ -280,7 +285,7 @@ In Thick mode, the following files from the zip are needed: - `sqlnet.ora` - Configures Oracle Network settings - `cwallet.sso` - Enables SSL/TLS connections -After unzipping the files in a secure directory, set the **TNS_ADMIN** environment variable to that directory name. +After unzipping the files in a secure directory, set the **TNS_ADMIN** environment variable to that directory name. ```bash export TNS_ADMIN=/path/to/directory_containing_tnsnames.ora @@ -320,7 +325,7 @@ Starting with `dbt-oracle==1.0.2`, it is **optional** to set the database name export DBT_ORACLE_DATABASE=example_db2022adb ``` -If database name is not set, adapter will retrieve it using the following query. +If database name is not set, adapter will retrieve it using the following query. ```sql SELECT SYS_CONTEXT('userenv', 'DB_NAME') FROM DUAL @@ -344,7 +349,7 @@ The directory location of `tnsnames.ora` file can be specified using `TNS_ADMIN` ```text -db2022adb_high = (description = +db2022adb_high = (description = (retry_count=20)(retry_delay=3) (address=(protocol=tcps) (port=1522) @@ -450,31 +455,161 @@ dbt_test: - + + + + + + + +## Python Models using Oracle Autonomous Database (ADB-S) + +Oracle's Autonomous Database Serverless (ADB-S) users can run dbt-py models using Oracle Machine Learning (OML4PY) which is available without any extra setup required. 
+ +### Features +- User Defined Python function is run in an ADB-S spawned Python 3.10 runtime +- Import [3rd party Python packages](https://docs.oracle.com/en/database/oracle/machine-learning/oml-notebooks/omlug/oml4py-notebook.html#GUID-78225241-CD6B-4588-AD4B-799079FA1784) installed in the default Python runtime +- Access to Database session in the Python function +- DataFrame read API to read `TABLES`, `VIEWS` and ad-hoc `SELECT` queries as DataFrames +- DataFrame write API to write DataFrames as `TABLES` +- Supports both table and incremental materialization +- Integration with conda (Coming Soon) + +### Setup + +#### Required roles + +- User must be non-ADMIN to execute the Python function +- User must be granted the `OML_DEVELOPER` role + +#### OML Cloud Service URL + +OML Cloud Service URL is of the following format +```text +https://tenant1-dbt.adb.us-sanjose-1.oraclecloudapps.com +``` +In this example, + - `tenant1` is the tenancy ID + - `dbt` is the database name + - `us-sanjose-1` is the datacenter region + - `oraclecloudapps.com` is the root domain + +Add `oml_cloud_service_url` to your existing `~/.dbt/profiles.yml` + ```yaml dbt_test: - target: "{{ env_var('DBT_TARGET', 'dev') }}" + target: dev outputs: dev: type: oracle user: "{{ env_var('DBT_ORACLE_USER') }}" pass: "{{ env_var('DBT_ORACLE_PASSWORD') }}" - protocol: "tcps" - host: "{{ env_var('DBT_ORACLE_HOST') }}" - port: 1522 - service: "{{ env_var('DBT_ORACLE_SERVICE') }}" - database: "{{ env_var('DBT_ORACLE_DATABASE') }}" + tns_name: "{{ env_var('DBT_ORACLE_TNS_NAME') }}" schema: "{{ env_var('DBT_ORACLE_SCHEMA') }}" - threads: 4 + oml_cloud_service_url: "https://tenant1-dbt.adb.us-sanjose-1.oraclecloudapps.com" ``` - - +### Python model configurations -
    +| Configuration | Datatype | Examples | +|--|--------|-----------------------------------------------------------------------------------------------| +| Materialization | String | `dbt.config(materialized="incremental")` or `dbt.config(materialized="table")` | +| Service | String | `dbt.config(service="HIGH")` or `dbt.config(service="MEDIUM")` or `dbt.config(service="LOW")` | +| Async Mode | Boolean | `dbt.config(async_flag=True)` +| Timeout in seconds only to be used with **_async_** mode (`min: 1800` and `max: 43200`) | Integer | `dbt.config(timeout=1800)` | + +In async mode, dbt-oracle will schedule a Python job, poll the job's status and wait for it to complete. +Without async mode, dbt-oracle will immediately invoke the Python job in a blocking manner. Use async mode for long-running Python jobs. + +### Python model examples + +#### Refer other model + +Use `dbt.ref(model_name)` to refer either SQL or Python model + +```python +def model(dbt, session): + # Must be either table or incremental (view is not currently supported) + dbt.config(materialized="table") + # returns oml.core.DataFrame referring a dbt model + s_df = dbt.ref("sales_cost") + return s_df +``` + +#### Refer a source + +Use `dbt.source(source_schema, table_name)` + +```python +def model(dbt, session): + # Must be either table or incremental (view is not currently supported) + dbt.config(materialized="table") + # oml.core.DataFrame representing a datasource + s_df = dbt.source("sh_database", "channels") + return s_df + +``` + +#### Incremental materialization + +```python +def model(dbt, session): + # Must be either table or incremental + dbt.config(materialized="incremental") + # oml.DataFrame representing a datasource + sales_cost_df = dbt.ref("sales_cost") + + if dbt.is_incremental: + cr = session.cursor() + result = cr.execute(f"select max(cost_timestamp) from {dbt.this.identifier}") + max_timestamp = result.fetchone()[0] + # filter new rows + sales_cost_df = 
sales_cost_df[sales_cost_df["COST_TIMESTAMP"] > max_timestamp] + + return sales_cost_df +``` + +#### Concatenate a new column in Dataframe + +```python + +def model(dbt, session): + dbt.config(materialized="table") + dbt.config(async_flag=True) + dbt.config(timeout=1800) + + sql = f"""SELECT customer.cust_first_name, + customer.cust_last_name, + customer.cust_gender, + customer.cust_marital_status, + customer.cust_street_address, + customer.cust_email, + customer.cust_credit_limit, + customer.cust_income_level + FROM sh.customers customer, sh.countries country + WHERE country.country_iso_code = ''US'' + AND customer.country_id = country.country_id""" + + # session.sync(query) will run the sql query and returns a oml.core.DataFrame + us_potential_customers = session.sync(query=sql) + + # Compute an ad-hoc anomaly score on the credit limit + median_credit_limit = us_potential_customers["CUST_CREDIT_LIMIT"].median() + mean_credit_limit = us_potential_customers["CUST_CREDIT_LIMIT"].mean() + anomaly_score = (us_potential_customers["CUST_CREDIT_LIMIT"] - median_credit_limit)/(median_credit_limit - mean_credit_limit) + + # Add a new column "CUST_CREDIT_ANOMALY_SCORE" + us_potential_customers = us_potential_customers.concat({"CUST_CREDIT_ANOMALY_SCORE": anomaly_score.round(3)}) + + # Return potential customers dataset as a oml.core.DataFrame + return us_potential_customers + +``` + + ## Supported Features @@ -491,8 +626,8 @@ dbt_test: - Exposures - Document generation - Serve project documentation as a website +- Python Models (from dbt-oracle version 1.5.1) - All dbt commands are supported ## Not Supported features - Ephemeral materialization - diff --git a/website/docs/reference/warehouse-setups/postgres-setup.md b/website/docs/docs/core/connect-data-platform/postgres-setup.md similarity index 73% rename from website/docs/reference/warehouse-setups/postgres-setup.md rename to website/docs/docs/core/connect-data-platform/postgres-setup.md index 0955f731974..f56d3f22576 
100644 --- a/website/docs/reference/warehouse-setups/postgres-setup.md +++ b/website/docs/docs/core/connect-data-platform/postgres-setup.md @@ -1,5 +1,6 @@ --- title: "Postgres setup" +description: "Read this guide to learn about the Postgres warehouse setup in dbt." id: "postgres-setup" meta: maintained_by: dbt Labs @@ -12,9 +13,11 @@ meta: slack_channel_name: '#db-postgres' slack_channel_link: 'https://getdbt.slack.com/archives/C0172G2E273' platform_name: 'Postgres' - config_page: 'postgres-configs' + config_page: '/reference/resource-configs/postgres-configs' --- + +

    Overview of {frontMatter.meta.pypi_package}

      @@ -60,16 +63,19 @@ company-name: user: [username] password: [password] port: [port] - dbname: [database name] + dbname: [database name] # or database instead of dbname schema: [dbt schema] - threads: [1 or more] + threads: [optional, 1 or more] [keepalives_idle](#keepalives_idle): 0 # default 0, indicating the system default. See below connect_timeout: 10 # default 10 seconds [retries](#retries): 1 # default 1 retry on error/timeout when opening connections [search_path](#search_path): [optional, override the default postgres search_path] [role](#role): [optional, set the role dbt assumes when executing queries] [sslmode](#sslmode): [optional, set the sslmode used to connect to the database] - + [sslcert](#sslcert): [optional, set the sslcert to control the certificate file location] + [sslkey](#sslkey): [optional, set the sslkey to control the location of the private key] + [sslrootcert](#sslrootcert): [optional, set the sslrootcert config value to a new file path in order to customize the file location that contains root certificates] + ``` @@ -82,16 +88,25 @@ The `search_path` config controls the Postgres "search path" that dbt configures #### role - Added in v0.16.0 - The `role` config controls the Postgres role that dbt assumes when opening new connections to the database. #### sslmode - Added in v0.16.0 - The `sslmode` config controls how dbt connectes to Postgres databases using SSL. See [the Postgres docs](https://www.postgresql.org/docs/9.1/libpq-ssl.html) on `sslmode` for usage information. When unset, dbt will connect to databases using the Postgres default, `prefer`, as the `sslmode`. + +#### sslcert + +The `sslcert` config controls the location of the certificate file used to connect to Postgres when using client SSL connections. To use a certificate file that is not in the default location, set that file path using this value. Without this config set, dbt uses the Postgres default locations. 
See [Client Certificates](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-CLIENTCERT) in the Postgres SSL docs for the default paths. + +#### sslkey + +The `sslkey` config controls the location of the private key for connecting to Postgres using client SSL connections. If this config is omitted, dbt uses the default key location for Postgres. See [Client Certificates](https://www.postgresql.org/docs/current/libpq-ssl.html#LIBPQ-SSL-CLIENTCERT) in the Postgres SSL docs for the default locations. + +#### sslrootcert + +When connecting to a Postgres server using a client SSL connection, dbt verifies that the server provides an SSL certificate signed by a trusted root certificate. These root certificates are in the `~/.postgresql/root.crt` file by default. To customize the location of this file, set the `sslrootcert` config value to a new file path. + ### `keepalives_idle` If the database closes its connection while dbt is waiting for data, you may see the error `SSL SYSCALL error: EOF detected`. Lowering the [`keepalives_idle` value](https://www.postgresql.org/docs/9.3/libpq-connect.html) may prevent this, because the server will send a ping to keep the connection active more frequently. diff --git a/website/docs/docs/core/connect-data-platform/profiles.yml.md b/website/docs/docs/core/connect-data-platform/profiles.yml.md new file mode 100644 index 00000000000..97254dda1c4 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/profiles.yml.md @@ -0,0 +1,58 @@ +--- +title: "About profiles.yml" +id: profiles.yml +--- + +If you're using [dbt Core](/docs/core/about-dbt-core), you'll need a `profiles.yml` file that contains the connection details for your data platform. When you run dbt Core from the command line, it reads your `dbt_project.yml` file to find the `profile` name, and then looks for a profile with the same name in your `profiles.yml` file. This profile contains all the information dbt needs to connect to your data platform. 
+ +For detailed info, you can refer to the [Connection profiles](/docs/core/connect-data-platform/connection-profiles). + +:::tip No `profiles.yml` file needed for dbt Cloud +If you're using dbt Cloud, you can [connect to your data platform](/docs/cloud/connect-data-platform/about-connections) directly in the dbt Cloud interface and don't need a `profiles.yml` file. +::: + + +This section identifies the parts of your `profiles.yml` that aren't specific to a particular data platform. For specific connection details, refer to the relevant page for your data platform. + + + +```yml +[config](/reference/global-configs/about-global-configs): + [send_anonymous_usage_stats](/reference/global-configs/usage-stats): + [use_colors](/reference/global-configs/print-output#print-color): + [partial_parse](/reference/global-configs/parsing): + [printer_width](/reference/global-configs/print-output#printer-width): + [write_json](/reference/global-configs/json-artifacts): + [warn_error](/reference/global-configs/warnings): + [warn_error_options](/reference/global-configs/warnings): ] | include: all, exclude: []> + [log_format](/reference/global-configs/logs): + [debug](/reference/global-configs/logs#log-level): + [version_check](/reference/global-configs/version-compatibility): + [fail_fast](/reference/global-configs/failing-fast): + [use_experimental_parser](/reference/global-configs/parsing): + [static_parser](/reference/global-configs/parsing): + +: + target: # this is the default target + outputs: + : + type: + schema: + threads: + + ### database-specific connection details + ... + + : # additional targets + ... + +: # additional profiles + ... + +``` + + + +## User config + +You can set default values of global configs for all projects that you run using your local machine. See the docs on [global configs](/reference/global-configs/about-global-configs) for details. 
diff --git a/website/docs/docs/core/connect-data-platform/redshift-setup.md b/website/docs/docs/core/connect-data-platform/redshift-setup.md new file mode 100644 index 00000000000..175d5f6a715 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/redshift-setup.md @@ -0,0 +1,276 @@ +--- +title: "Redshift setup" +description: "Read this guide to learn about the Redshift warehouse setup in dbt." +id: "redshift-setup" +meta: + maintained_by: dbt Labs + authors: 'core dbt maintainers' + github_repo: 'dbt-labs/dbt-redshift' + pypi_package: 'dbt-redshift' + min_core_version: 'v0.10.0' + cloud_support: Supported + min_supported_version: 'n/a' + slack_channel_name: '#db-redshift' + slack_channel_link: 'https://getdbt.slack.com/archives/C01DRQ178LQ' + platform_name: 'Redshift' + config_page: '/reference/resource-configs/redshift-configs' +--- + + + +

      Overview of {frontMatter.meta.pypi_package}

      + +
        +
      • Maintained by: {frontMatter.meta.maintained_by}
      • +
      • Authors: {frontMatter.meta.authors}
      • +
      • GitHub repo: {frontMatter.meta.github_repo}
      • +
      • PyPI package: {frontMatter.meta.pypi_package}
      • +
      • Slack channel: {frontMatter.meta.slack_channel_name}
      • +
      • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
      • +
      • dbt Cloud support: {frontMatter.meta.cloud_support}
      • +
      • Minimum data platform version: {frontMatter.meta.min_supported_version}
      • +
      + + +

      Installing {frontMatter.meta.pypi_package}

      + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

      Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

      + +

      Configuring {frontMatter.meta.pypi_package}

      + +

      For {frontMatter.meta.platform_name}-specific configuration, refer to {frontMatter.meta.platform_name} Configuration.

      + +

      For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}.

      + +## Configurations + +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `type` | redshift | The type of data warehouse you are connecting to| +| `host` | hostname.region.redshift.amazonaws.com| Host of cluster | +| `port` | 5439 | | +| `dbname` | my_db | Database name| +| `schema` | my_schema | Schema name| +| `connect_timeout` | `None` or 30 | Number of seconds before connection times out| +| `sslmode` | prefer | optional, set the sslmode to connect to the database. Default prefer, which will use 'verify-ca' to connect. For more information on `sslmode`, see Redshift note below| +| `role` | None | Optional, user identifier of the current session| +| `autocreate` | false | Optional, default false. Creates user if they do not exist | +| `db_groups` | ['ANALYSTS'] | Optional. A list of existing database group names that the DbUser joins for the current session | +| `ra3_node` | true | Optional, default False. Enables cross-database sources| +| `autocommit` | true | Optional, default True. Enables autocommit after each statement| +| `retries` | 1 | Number of retries | + + +## Authentication Parameters + +The authentication methods that dbt Core supports are: + +- `database` — Password-based authentication (default, will be used if `method` is not provided) +- `IAM` — IAM + + +Click on one of these authentication methods for further details on how to configure your connection profile. Each tab also includes an example `profiles.yml` configuration file for you to review. + + + + + +The following table contains the parameters for the database (password-based) connection method. 
+ + +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `method` | database| Leave this parameter unconfigured, or set this to database | +| `host` | hostname.region.redshift.amazonaws.com| Host of cluster | +| `user` | username | Account username to log into your cluster | +| `password` | password1 | Password for authentication | + +
      + +#### Example profiles.yml for database authentication + + + +```yaml +company-name: + target: dev + outputs: + dev: + type: redshift + host: hostname.region.redshift.amazonaws.com + user: username + password: password1 + dbname: analytics + schema: analytics + port: 5439 + + # Optional Redshift configs: + sslmode: prefer + role: None + ra3_node: true + autocommit: true + threads: 4 + connect_timeout: None + +``` + + + +
      + + + +The following table lists the authentication parameters to use IAM authentication. + +To set up a Redshift profile using IAM Authentication, set the `method` parameter to `iam` as shown below. Note that a password is not required when using IAM Authentication. For more information on this type of authentication, +consult the [Redshift Documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/generating-user-credentials.html) +and [boto3 +docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.get_cluster_credentials) +on generating user credentials with IAM Auth. + +If you receive the "You must specify a region" error when using IAM +Authentication, then your aws credentials are likely misconfigured. Try running +`aws configure` to set up AWS access keys, and pick a default region. If you have any questions, +please refer to the official AWS documentation on [Configuration and credential file settings](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html). + + +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `method` |IAM| use IAM to authenticate | +| `iam_profile` | analyst | dbt will use the specified profile from your ~/.aws/config file | +| `cluster_id` | CLUSTER_ID| Required for IAM | +| `user` | username | Account user to log into your cluster | +| `region` | us-east-1 | Required for IAM authentication | + + +
      + + +#### Example profiles.yml for IAM + + + +```yaml + my-redshift-db: + target: dev + outputs: + dev: + type: redshift + method: iam + cluster_id: CLUSTER_ID + host: hostname.region.redshift.amazonaws.com + user: alice + iam_profile: analyst + dbname: analytics + schema: analytics + port: 5439 + + # Optional Redshift configs: + threads: 4 + connect_timeout: None + [retries](#retries): 1 + role: None + sslmode: prefer + ra3_node: true + autocommit: true + region: us-east-1 + autocreate: true + db_groups: ['ANALYSTS'] + +``` + + + +
      + +
      + + +### Specifying an IAM Profile + +When the `iam_profile` configuration is set, dbt will use the specified profile from your `~/.aws/config` file instead of using the profile name `default` + +## Redshift notes + +### `sslmode` change +Prior to dbt-redshift 1.5, `psycopg2` was used as the driver. `psycopg2` accepts `disable`, `prefer`, `allow`, `require`, `verify-ca`, `verify-full` as valid inputs of `sslmode`, and does not have an `ssl` parameter, as indicated in PostgreSQL [doc](https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING:~:text=%2Dencrypted%20connection.-,sslmode,-This%20option%20determines). + +In dbt-redshift 1.5, we switched to using `redshift_connector`, which accepts `verify-ca`, and `verify-full` as valid `sslmode` inputs, and has a `ssl` parameter of `True` or `False`, according to redshift [doc](https://docs.aws.amazon.com/redshift/latest/mgmt/python-configuration-options.html#:~:text=parameter%20is%20optional.-,sslmode,-Default%20value%20%E2%80%93%20verify). + +For backward compatibility, dbt-redshift now supports valid inputs for `sslmode` in `psycopg2`. We've added conversion logic mapping each of `psycopg2`'s accepted `sslmode` values to the corresponding `ssl` and `sslmode` parameters in `redshift_connector`. 
+ +The table below details accepted `sslmode` parameters and how the connection will be made according to each option: + +`sslmode` parameter | Expected behavior in dbt-redshift | Actions behind the scenes +-- | -- | -- +disable | Connection will be made without using ssl | Set `ssl` = False +allow | Connection will be made using verify-ca | Set `ssl` = True & `sslmode` = verify-ca +prefer | Connection will be made using verify-ca | Set `ssl` = True & `sslmode` = verify-ca +require | Connection will be made using verify-ca | Set `ssl` = True & `sslmode` = verify-ca +verify-ca | Connection will be made using verify-ca | Set `ssl` = True & `sslmode` = verify-ca +verify-full | Connection will be made using verify-full | Set `ssl` = True & `sslmode` = verify-full + +When a connection is made using `verify-ca`, dbt will look for the CA certificate in `~/redshift-ca-bundle.crt`. + +For more details on sslmode changes, our design choices, and reasoning — please refer to the [PR pertaining to this change](https://github.com/dbt-labs/dbt-redshift/pull/439). + +### `autocommit` parameter + +The [autocommit mode](https://www.psycopg.org/docs/connection.html#connection.autocommit) is useful to execute commands that run outside a transaction. Connection objects used in Python must have `autocommit = True` to run operations such as `CREATE DATABASE`, and `VACUUM`. `autocommit` is off by default in `redshift_connector`, but we've changed this default to `True` to ensure certain macros run successfully in your dbt project. + +If desired, you can define a separate target with `autocommit=True` as such: + + + +```yaml +profile-to-my-RS-target: + target: dev + outputs: + dev: + type: redshift + ... + autocommit: False + + + profile-to-my-RS-target-with-autocommit-enabled: + target: dev + outputs: + dev: + type: redshift + ... + autocommit: True + ``` + + +To run certain macros with autocommit, load the profile with autocommit using the `--profile` flag. 
For more context, please refer to this [PR](https://github.com/dbt-labs/dbt-redshift/pull/475/files). + + +### Deprecated `profile` parameters in 1.5 + +- `iam_duration_seconds` + +- `keepalives_idle` + +### `sort` and `dist` keys + +Where possible, dbt enables the use of `sort` and `dist` keys. See the section on [Redshift specific configurations](/reference/resource-configs/redshift-configs). + + + + + +#### retries + +If `dbt-redshift` encounters an operational error or timeout when opening a new connection, it will retry up to the number of times configured by `retries`. If set to 2+ retries, dbt will wait 1 second before retrying. The default value is 1 retry. If set to 0, dbt will not retry at all. + + diff --git a/website/docs/reference/warehouse-setups/rockset-setup.md b/website/docs/docs/core/connect-data-platform/rockset-setup.md similarity index 96% rename from website/docs/reference/warehouse-setups/rockset-setup.md rename to website/docs/docs/core/connect-data-platform/rockset-setup.md index c80b28a5f68..4a146829a03 100644 --- a/website/docs/reference/warehouse-setups/rockset-setup.md +++ b/website/docs/docs/core/connect-data-platform/rockset-setup.md @@ -1,5 +1,6 @@ --- title: "Rockset setup" +description: "Read this guide to learn about the Rockset warehouse setup in dbt." id: "rockset-setup" meta: maintained_by: Rockset, Inc. 
@@ -12,7 +13,7 @@ meta: slack_channel_name: '#dbt-rockset' slack_channel_link: 'https://getdbt.slack.com/archives/C02J7AZUAMN' platform_name: 'Rockset' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Vendor-supported plugin diff --git a/website/docs/reference/warehouse-setups/singlestore-setup.md b/website/docs/docs/core/connect-data-platform/singlestore-setup.md similarity index 97% rename from website/docs/reference/warehouse-setups/singlestore-setup.md rename to website/docs/docs/core/connect-data-platform/singlestore-setup.md index d0b88c1f0ac..a63466542a9 100644 --- a/website/docs/reference/warehouse-setups/singlestore-setup.md +++ b/website/docs/docs/core/connect-data-platform/singlestore-setup.md @@ -1,5 +1,6 @@ --- title: "SingleStore setup" +description: "Read this guide to learn about the SingleStore warehouse setup in dbt." id: "singlestore-setup" meta: maintained_by: SingleStore, Inc. @@ -7,12 +8,12 @@ meta: github_repo: 'memsql/dbt-singlestore' pypi_package: 'dbt-singlestore' min_core_version: 'v1.0.0' - cloud_support: Supported + cloud_support: Not supported min_supported_version: 'v7.5' slack_channel_name: 'db-singlestore' slack_channel_link: 'https://getdbt.slack.com/archives/C02V2QHFF7U' platform_name: 'SingleStore' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Vendor-supported plugin diff --git a/website/docs/reference/warehouse-setups/snowflake-setup.md b/website/docs/docs/core/connect-data-platform/snowflake-setup.md similarity index 71% rename from website/docs/reference/warehouse-setups/snowflake-setup.md rename to website/docs/docs/core/connect-data-platform/snowflake-setup.md index a22f679e208..98bcf447fed 100644 --- a/website/docs/reference/warehouse-setups/snowflake-setup.md +++ b/website/docs/docs/core/connect-data-platform/snowflake-setup.md @@ -1,5 +1,6 @@ --- title: "Snowflake setup" +description: "Read this guide to learn about the 
Snowflake warehouse setup in dbt." id: "snowflake-setup" meta: maintained_by: dbt Labs @@ -12,9 +13,11 @@ meta: slack_channel_name: '#db-snowflake' slack_channel_link: 'https://getdbt.slack.com/archives/C01DRQ178LQ' platform_name: 'Snowflake' - config_page: 'snowflake-configs' + config_page: '/reference/resource-configs/snowflake-configs' --- + +

      Overview of {frontMatter.meta.pypi_package}

        @@ -75,9 +78,10 @@ my-snowflake-db: # optional connect_retries: 0 # default 0 connect_timeout: 10 # default: 10 - retry_on_database_errors: False # default: false + retry_on_database_errors: False # default: false retry_all: False # default: false -``` + reuse_connections: False # default: false (available v1.4+) + ``` @@ -109,8 +113,9 @@ my-snowflake-db: # optional connect_retries: 0 # default 0 connect_timeout: 10 # default: 10 - retry_on_database_errors: False # default: false + retry_on_database_errors: False # default: false retry_all: False # default: false + reuse_connections: False # default: false (available v1.4+) ``` Along with adding the `authenticator` parameter, be sure to run `alter account set allow_client_mfa_caching = true;` in your Snowflake warehouse. Together, these will allow you to easily verify authentication with the DUO Mobile app (skipping this results in push notifications for every model built on every `dbt run`). @@ -119,6 +124,9 @@ Along with adding the `authenticator` parameter, be sure to run `alter account s To use key pair authentication, omit a `password` and instead provide a `private_key_path` and, optionally, a `private_key_passphrase` in your target. **Note:** Versions of dbt before 0.16.0 required that private keys were encrypted and a `private_key_passphrase` was provided. This behavior was changed in dbt v0.16.0. +Starting from [dbt v1.5.0](/docs/dbt-versions/core), you have the option to use a `private_key` string instead of a `private_key_path`. The `private_key` string should be in either Base64-encoded DER format, representing the key bytes, or a plain-text PEM format. Refer to [Snowflake documentation](https://docs.snowflake.com/developer-guide/python-connector/python-connector-example#using-key-pair-authentication-key-pair-rotation) for more info on how they generate the key. 
+ + ```yaml @@ -133,6 +141,7 @@ my-snowflake-db: # Keypair config private_key_path: [path/to/private.key] + # or private_key instead of private_key_path private_key_passphrase: [passphrase for the private key, if key is encrypted] database: [database name] @@ -145,19 +154,22 @@ my-snowflake-db: # optional connect_retries: 0 # default 0 connect_timeout: 10 # default: 10 - retry_on_database_errors: False # default: false - retry_all: False # default: false + retry_on_database_errors: False # default: false + retry_all: False # default: false + reuse_connections: False # default: false ``` ### SSO Authentication -To use SSO authentication for Snowflake, omit a `password` and instead supply an `authenticator` config to your target. `authenticator` can be one of 'externalbrowser' or a valid Okta URL. +To use SSO authentication for Snowflake, omit a `password` and instead supply an `authenticator` config to your target. +`authenticator` can be one of 'externalbrowser' or a valid Okta URL. -New in v0.18.0 +Refer to the following tabs for more info and examples: -**Note**: By default, every connection that dbt opens will require you to re-authenticate in a browser. The Snowflake connector package supports caching your session token, but it [currently only supports Windows and Mac OS](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-use.html#optional-using-connection-caching-to-minimize-the-number-of-prompts-for-authentication). See [the Snowflake docs](https://docs.snowflake.com/en/sql-reference/parameters.html#label-allow-id-token) for how to enable this feature in your account. 
+ + @@ -167,15 +179,53 @@ my-snowflake-db: outputs: dev: type: snowflake - account: [account id] - user: [username] - role: [user role] + account: [account id] # Snowflake + user: [username] # Snowflake username + role: [user role] # Snowflake user role # SSO config authenticator: externalbrowser - database: [database name] - warehouse: [warehouse name] + database: [database name] # Snowflake database name + warehouse: [warehouse name] # Snowflake warehouse name + schema: [dbt schema] + threads: [between 1 and 8] + client_session_keep_alive: False + query_tag: [anything] + + # optional + connect_retries: 0 # default 0 + connect_timeout: 10 # default: 10 + retry_on_database_errors: False # default: false + retry_all: False # default: false + reuse_connections: False # default: false +``` + + + + + + + + + +```yaml +my-snowflake-db: + target: dev + outputs: + dev: + type: snowflake + account: [account id] # Snowflake + user: [username] # Snowflake username + role: [user role] # Snowflake user role + + # SSO config -- The three following fields are REQUIRED + authenticator: [Okta account URL] + username: [Okta username] + password: [Okta password] + + database: [database name] # Snowflake database name + warehouse: [warehouse name] # Snowflake warehouse name schema: [dbt schema] threads: [between 1 and 8] client_session_keep_alive: False @@ -184,12 +234,19 @@ my-snowflake-db: # optional connect_retries: 0 # default 0 connect_timeout: 10 # default: 10 - retry_on_database_errors: False # default: false - retry_all: False # default: false + retry_on_database_errors: False # default: false + retry_all: False # default: false + reuse_connections: False # default: false ``` + + + +**Note**: By default, every connection that dbt opens will require you to re-authenticate in a browser. 
The Snowflake connector package supports caching your session token, but it [currently only supports Windows and Mac OS](https://docs.snowflake.com/en/user-guide/admin-security-fed-auth-use.html#optional-using-connection-caching-to-minimize-the-number-of-prompts-for-authentication). + +Refer to the [Snowflake docs](https://docs.snowflake.com/en/sql-reference/parameters.html#label-allow-id-token) for info on how to enable this feature in your account. ## Configurations @@ -212,9 +269,10 @@ The "base" configs for Snowflake targets are shown below. Note that you should a | retry_on_database_errors | No | A boolean flag indicating whether to retry after encountering errors of type [snowflake.connector.errors.DatabaseError](https://github.com/snowflakedb/snowflake-connector-python/blob/ffdd6b3339aa71885878d047141fe9a77c4a4ae3/src/snowflake/connector/errors.py#L361-L364) | | connect_retries | No | The number of times to retry after an unsuccessful connection | | connect_timeout | No | The number of seconds to sleep between failed connection retries | +| reuse_connections | No | A boolean flag indicating whether to reuse idle connections to help reduce total connections opened. Default is `False`. | ### account -For AWS accounts in the US West default region, you can use `abc123` (without any other segments). For some AWS accounts you will have to append the region and/or cloud platform. For example, `abc123.eu-west-1` or `abc123.eu-west-2.aws`. For GCP and Azure-based accounts, you have to append the region and cloud platform, such as `gcp` or `azure`, respectively. For example, `abc123.us-central1.gcp`. 
For details, see Snowflake's documention: "[Specifying Region Information in Your Account Hostname](https://docs.snowflake.com/en/user-guide/intro-regions.html#specifying-region-information-in-your-account-hostname)" and "[Account Identifier Formats by Cloud Platform and Region](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#account-identifier-formats-by-cloud-platform-and-region)". +For AWS accounts in the US West default region, you can use `abc123` (without any other segments). For some AWS accounts you will have to append the region and/or cloud platform. For example, `abc123.eu-west-1` or `abc123.eu-west-2.aws`. For GCP and Azure-based accounts, you have to append the region and cloud platform, such as `gcp` or `azure`, respectively. For example, `abc123.us-central1.gcp`. For details, see Snowflake's documentation: "[Specifying Region Information in Your Account Hostname](https://docs.snowflake.com/en/user-guide/intro-regions.html#specifying-region-information-in-your-account-hostname)". Please also note that the Snowflake account name should only be the `<account_name>` without the prefixed `<organization_name>`. Relevant documentation: "[Account Identifier Formats by Cloud Platform and Region](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html#account-identifier-formats-by-cloud-platform-and-region)". ### client_session_keep_alive @@ -223,20 +281,21 @@ The `client_session_keep_alive` feature is intended to keep Snowflake sessions a ### query_tag -New in v0.18.0 - [Query tags](https://docs.snowflake.com/en/sql-reference/parameters.html#query-tag) are a Snowflake parameter that can be quite useful later on when searching in the [QUERY_HISTORY view](https://docs.snowflake.com/en/sql-reference/account-usage/query_history.html). + -### retry_on_database_errors +### reuse_connections + +During node execution (such as model and test), dbt opens connections against a Snowflake warehouse. 
Setting this configuration to `True` reduces execution time by verifying credentials only once for each thread. -New in v1.0.0. + + +### retry_on_database_errors The `retry_on_database_errors` flag along with the `connect_retries` count specification is intended to make retries configurable after the snowflake connector encounters errors of type snowflake.connector.errors.DatabaseError. These retries can be helpful for handling errors of type "JWT token is invalid" when using key pair authentication. ### retry_all -New in v1.0.0. - The `retry_all` flag along with the `connect_retries` count specification is intended to make retries configurable after the snowflake connector encounters any error. diff --git a/website/docs/reference/warehouse-setups/spark-setup.md b/website/docs/docs/core/connect-data-platform/spark-setup.md similarity index 83% rename from website/docs/reference/warehouse-setups/spark-setup.md rename to website/docs/docs/core/connect-data-platform/spark-setup.md index 14254683637..895f0559953 100644 --- a/website/docs/reference/warehouse-setups/spark-setup.md +++ b/website/docs/docs/core/connect-data-platform/spark-setup.md @@ -1,5 +1,6 @@ --- -title: "Apache Spark Profile" +title: "Apache Spark setup" +description: "Read this guide to learn about the Apache Spark warehouse setup in dbt." id: "spark-setup" meta: maintained_by: dbt Labs @@ -12,9 +13,17 @@ meta: slack_channel_name: 'db-databricks-and-spark' slack_channel_link: 'https://getdbt.slack.com/archives/CNGCW8HKL' platform_name: 'Spark' - config_page: 'spark-configs' + config_page: '/reference/resource-configs/spark-configs' --- + + + + +:::note +See [Databricks setup](#databricks-setup) for the Databricks version of this page. +::: +

        Overview of {frontMatter.meta.pypi_package}

          @@ -48,18 +57,14 @@ $ pip install "dbt-spark[ODBC]" $ pip install "dbt-spark[PyHive]" ``` - - ```zsh # session connections $ pip install "dbt-spark[session]" ``` - -

          Configuring {frontMatter.meta.pypi_package}

          -

          For {frontMatter.meta.platform_name}-specifc configuration please refer to {frontMatter.meta.platform_name} Configuration

          +

          For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

          For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

          @@ -71,7 +76,6 @@ dbt-spark can connect to Spark clusters by three different methods: - [`thrift`](#thrift) connects directly to the lead node of a cluster, either locally hosted / on premise or in the cloud (e.g. Amazon EMR). - [`http`](#http) is a more generic method for connecting to a managed service that provides an HTTP endpoint. Currently, this includes connections to a Databricks interactive cluster. - - [`session`](#session) connects to a pySpark session, running locally or on a remote machine. @@ -79,12 +83,9 @@ dbt-spark can connect to Spark clusters by three different methods: The `session` connection method is intended for advanced users and experimental dbt development. This connection method is not supported by dbt Cloud. ::: - ### ODBC -New in v0.18.1 - Use the `odbc` connection method if you are connecting to a Databricks SQL endpoint or interactive cluster via ODBC driver. (Download the latest version of the official driver [here](https://databricks.com/spark/odbc-driver-download).) @@ -110,9 +111,7 @@ your_profile_name: port: [port] # default 443 user: [user] server_side_parameters: - # cluster configuration parameters, otherwise applied via `SET` statements - # for example: - # "spark.databricks.delta.schema.autoMerge.enabled": True + "spark.driver.memory": "4g" ``` @@ -139,6 +138,8 @@ your_profile_name: auth: [e.g. KERBEROS] kerberos_service_name: [e.g. hive] use_ssl: [true|false] # value of hive.server2.use.SSL, default false + server_side_parameters: + "spark.driver.memory": "4g" ``` @@ -167,6 +168,8 @@ your_profile_name: user: [user] connect_timeout: 60 # default 10 connect_retries: 5 # default 0 + server_side_parameters: + "spark.driver.memory": "4g" ``` @@ -175,8 +178,6 @@ Databricks interactive clusters can take several minutes to start up. You may include the optional profile configs `connect_timeout` and `connect_retries`, and dbt will periodically retry the connection. 
- - ### Session Use the `session` method if you want to run `dbt` against a pySpark session. @@ -192,14 +193,12 @@ your_profile_name: method: session schema: [database/schema name] host: NA # not used, but required by `dbt-core` + server_side_parameters: + "spark.driver.memory": "4g" ``` - - - - ## Optional configurations ### Retries @@ -218,6 +217,12 @@ connect_retries: 3 + + + +### Server side configuration + +Spark can be customized using [Application Properties](https://spark.apache.org/docs/latest/configuration.html). Using these properties the execution can be customized, for example, to allocate more memory to the driver process. Also, the Spark SQL runtime can be set through these properties. For example, this allows the user to [set a Spark catalog](https://spark.apache.org/docs/latest/configuration.html#spark-sql). ## Caveats @@ -231,6 +236,6 @@ Most dbt Core functionality is supported, but some features are only available on Delta Lake (Databricks). Delta-only features: -1. Incremental model updates by `unique_key` instead of `partition_by` (see [`merge` strategy](spark-configs#the-merge-strategy)) -2. [Snapshots](snapshots) -3. [Persisting](persist_docs) column-level descriptions as database comments +1. Incremental model updates by `unique_key` instead of `partition_by` (see [`merge` strategy](/reference/resource-configs/spark-configs#the-merge-strategy)) +2. [Snapshots](/docs/build/snapshots) +3. 
[Persisting](/reference/resource-configs/persist_docs) column-level descriptions as database comments diff --git a/website/docs/reference/warehouse-setups/sqlite-setup.md b/website/docs/docs/core/connect-data-platform/sqlite-setup.md similarity index 98% rename from website/docs/reference/warehouse-setups/sqlite-setup.md rename to website/docs/docs/core/connect-data-platform/sqlite-setup.md index acdf1fd7e64..3da902a6f80 100644 --- a/website/docs/reference/warehouse-setups/sqlite-setup.md +++ b/website/docs/docs/core/connect-data-platform/sqlite-setup.md @@ -1,5 +1,6 @@ --- title: "SQLite setup" +description: "Read this guide to learn about the SQLite warehouse setup in dbt." id: "sqlite-setup" meta: maintained_by: Community @@ -12,7 +13,7 @@ meta: slack_channel_name: 'n/a' slack_channel_link: 'https://www.getdbt.com/community' platform_name: 'SQLite' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Community plugin diff --git a/website/docs/reference/warehouse-setups/teradata-setup.md b/website/docs/docs/core/connect-data-platform/teradata-setup.md similarity index 97% rename from website/docs/reference/warehouse-setups/teradata-setup.md rename to website/docs/docs/core/connect-data-platform/teradata-setup.md index 72b137097c5..1fe33ff8929 100644 --- a/website/docs/reference/warehouse-setups/teradata-setup.md +++ b/website/docs/docs/core/connect-data-platform/teradata-setup.md @@ -1,5 +1,6 @@ --- title: "Teradata setup" +description: "Read this guide to learn about the Teradata warehouse setup in dbt." id: "teradata-setup" meta: maintained_by: Teradata @@ -12,7 +13,7 @@ meta: slack_channel_name: '#db-teradata' slack_channel_link: 'https://getdbt.slack.com/archives/C027B6BHMT3' platform_name: 'Teradata' - config_page: 'teradata-configs' + config_page: '/reference/resource-configs/teradata-configs' --- Some core functionality may be limited. 
If you're interested in contributing, check out the source code for the repository listed below. @@ -49,7 +50,7 @@ pip is the easiest way to install the adapter: ### Connecting to Teradata -To connect to Teradata Vantage from dbt, you'll need to add a [profile](https://docs.getdbt.com/dbt-cli/configure-your-profile) to your `profiles.yml` file. A Teradata profile conforms to the following syntax: +To connect to Teradata Vantage from dbt, you'll need to add a [profile](https://docs.getdbt.com/docs/core/connection-profiles) to your `profiles.yml` file. A Teradata profile conforms to the following syntax: diff --git a/website/docs/reference/warehouse-setups/tidb-setup.md b/website/docs/docs/core/connect-data-platform/tidb-setup.md similarity index 96% rename from website/docs/reference/warehouse-setups/tidb-setup.md rename to website/docs/docs/core/connect-data-platform/tidb-setup.md index d1a5f887bdf..e2205c4665e 100644 --- a/website/docs/reference/warehouse-setups/tidb-setup.md +++ b/website/docs/docs/core/connect-data-platform/tidb-setup.md @@ -1,17 +1,20 @@ --- title: "TiDB setup" +description: "Read this guide to learn about the TiDB warehouse setup in dbt." 
id: "tidb-setup" meta: maintained_by: PingCAP authors: Xiang Zhang, Qiang Wu, Yuhang Shi github_repo: 'pingcap/dbt-tidb' pypi_package: 'dbt-tidb' + min_core_version: 'v1.0.0' core_version: 'v1.0.0 and newer' cloud_support: Not Supported + min_supported_version: 'n/a' slack_channel_name: '#db-tidb' slack_channel_link: 'https://getdbt.slack.com/archives/C03CC86R1NY' platform_name: 'TiDB' - config_page: 'no-configs' + config_page: '/reference/resource-configs/no-configs' --- :::info Vendor-supported plugin diff --git a/website/docs/docs/core/connect-data-platform/trino-setup.md b/website/docs/docs/core/connect-data-platform/trino-setup.md new file mode 100644 index 00000000000..39d8ed8ab3f --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/trino-setup.md @@ -0,0 +1,331 @@ +--- +title: "Starburst/Trino setup" +description: "Read this guide to learn about the Starburst/Trino warehouse setup in dbt." +id: "trino-setup" +meta: + maintained_by: Starburst Data, Inc. + authors: Marius Grama, Przemek Denkiewicz, Michiel de Smet + github_repo: 'starburstdata/dbt-trino' + pypi_package: 'dbt-trino' + min_core_version: 'v0.20.0' + cloud_support: 'Supported' + min_supported_version: 'n/a' + slack_channel_name: '#db-starburst-and-trino' + slack_channel_link: 'https://getdbt.slack.com/archives/CNNPBQ24R' + platform_name: 'Starburst/Trino' + config_page: '/reference/resource-configs/trino-configs' +--- + + + +

          Overview of {frontMatter.meta.pypi_package}

          + +
            +
          • Maintained by: {frontMatter.meta.maintained_by}
          • +
          • Authors: {frontMatter.meta.authors}
          • +
          • GitHub repo: {frontMatter.meta.github_repo}
          • +
          • PyPI package: {frontMatter.meta.pypi_package}
          • +
          • Slack channel: {frontMatter.meta.slack_channel_name}
          • +
          • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
          • +
          • dbt Cloud support: {frontMatter.meta.cloud_support}
          • +
          • Minimum data platform version: {frontMatter.meta.min_supported_version}
          • +
          + +:::info Vendor-supported plugin + +Certain core functionality may vary. If you would like to report a bug, request a feature, or contribute, you can check out the linked repository and open an issue. + +::: + +

          Installing {frontMatter.meta.pypi_package}

          + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

          Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

          + +

          Configuring {frontMatter.meta.pypi_package}

          + +

          For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

          + +

          For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

          + +## Connecting to Starburst/Trino + +To connect to a data platform with dbt Core, create appropriate _profile_ and _target_ YAML keys/values in the `profiles.yml` configuration file for your Starburst/Trino clusters. This dbt YAML file lives in the `.dbt/` directory of your user/home directory. For more information, refer to [Connection profiles](/docs/core/connect-data-platform/connection-profiles) and [profiles.yml](/docs/core/connect-data-platform/profiles.yml). + +The parameters for setting up a connection are for Starburst Enterprise, Starburst Galaxy, and Trino clusters. Unless specified, "cluster" will mean any of these products' clusters. + +## Host parameters + +The following profile fields are always required except for `user`, which is also required unless you're using the `oauth`, `cert`, or `jwt` authentication methods. + +| Field | Example | Description | +| --------- | ------- | ----------- | +| `host` | `mycluster.mydomain.com` | The hostname of your cluster.

          Don't include the `http://` or `https://` prefix. | +| `database` | `my_postgres_catalog` | The name of a catalog in your cluster. | +| `schema` | `my_schema` | The name of a schema within your cluster's catalog.

          It's _not recommended_ to use schema names that have upper case or mixed case letters. | +| `port` | `443` | The port to connect to your cluster. By default, it's 443 for TLS enabled clusters. | +| `user` | Format for Starburst Enterprise or Trino:
          • `user.name`
          • `user.name@mydomain.com`

          Format for Starburst Galaxy:
          • `user.name@mydomain.com/role`
          | The username (of the account) to log in to your cluster. When connecting to Starburst Galaxy clusters, you must include the role of the user as a suffix to the username. | + +### Roles in Starburst Enterprise + + +### Schemas and databases + + +## Additional parameters + +The following profile fields are optional to set up. They let you configure your cluster's session and dbt for your connection. + + +| Profile field | Example | Description | +| ----------------------------- | -------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `threads` | `8` | How many threads dbt should use (default is `1`) | +| `roles` | `system: analyst` | Catalog roles can be set under the optional `roles` parameter using the following format: `catalog: role`. | +| `session_properties` | `query_max_run_time: 4h` | Sets Trino session properties used in the connection. Execute `SHOW SESSION` to see available options | +| `prepared_statements_enabled` | `true` or `false` | Enable usage of Trino prepared statements (used in `dbt seed` commands) (default: `true`) | +| `retries` | `10` | Configure how many times all database operation is retried when connection issues arise (default: `3`) | +| `timezone` | `Europe/Brussels` | The time zone for the Trino session (default: client-side local timezone) | +| `http_headers` | `X-Trino-Client-Info: dbt-trino` | HTTP Headers to send alongside requests to Trino, specified as a YAML dictionary of (header, value) pairs. 
| +| `http_scheme` | `https` or `http` | The HTTP scheme to use for requests to Trino (default: `http`, or `https` if `kerberos`, `ldap` or `jwt`) | + +## Authentication parameters + +The authentication methods that dbt Core supports are: + +- `ldap` — LDAP (username and password) +- `kerberos` — Kerberos +- `jwt` — JSON Web Token (JWT) +- `certificate` — Certificate-based authentication +- `oauth` — Open Authentication (OAuth) +- `none` — None, no authentication + +Set the `method` field to the authentication method you intend to use for the connection. For a high-level introduction to authentication in Trino, see [Trino Security: Authentication types](https://trino.io/docs/current/security/authentication-types.html). + +Click on one of these authentication methods for further details on how to configure your connection profile. Each tab also includes an example `profiles.yml` configuration file for you to review. + + + + + +The following table lists the authentication parameters to set for LDAP. + +For more information, refer to [LDAP authentication](https://trino.io/docs/current/security/ldap.html) in the Trino docs. + +| Profile field | Example | Description | +| ------------- | ------- | ------------ | +| `method` | `ldap`| Set LDAP as the authentication method. | +| `user` | Format for Starburst Enterprise or Trino:
          • `user.name`
          • `user.name@mydomain.com`

          Format for Starburst Galaxy:
          • `user.name@mydomain.com/role`
          | The username (of the account) to log in to your cluster. When connecting to Starburst Galaxy clusters, you must include the role of the user as a suffix to the username. | +| `password` | `abc123` | Password for authentication. | +| `impersonation_user` (optional) | `impersonated_tom` | Override the provided username. This lets you impersonate another user. | +
          + +#### Example profiles.yml for LDAP + + + +```yaml +trino: + target: dev + outputs: + dev: + type: trino + method: ldap + user: [user] + password: [password] + host: [hostname] + database: [database name] + schema: [your dbt schema] + port: [port number] + threads: [1 or more] +``` + + + +
          + + + +The following table lists the authentication parameters to set for Kerberos. + +For more information, refer to [Kerberos authentication](https://trino.io/docs/current/security/kerberos.html) in the Trino docs. + +| Profile field | Example | Description | +| ------------------------------------------- | ------------------- | ---------------------------------------------------------------- | +| `method` | `kerberos`| Set Kerberos as the authentication method. | +| `user` | `commander` | Username for authentication | +| `keytab` | `/tmp/trino.keytab` | Path to keytab | +| `krb5_config` | `/tmp/krb5.conf` | Path to config | +| `principal` | `trino@EXAMPLE.COM` | Principal | +| `service_name` (optional) | `abc123` | Service name (default is `trino`) | +| `hostname_override` (optional) | `EXAMPLE.COM` | Kerberos hostname for a host whose DNS name doesn't match | +| `mutual_authentication` (optional) | `false` | Boolean flag for mutual authentication | +| `force_preemptive` (optional) | `false` | Boolean flag to preemptively initiate the Kerberos GSS exchange | +| `sanitize_mutual_error_response` (optional) | `true` | Boolean flag to strip content and headers from error responses | +| `delegate` (optional) | `false` | Boolean flag for credential delegation (`GSS_C_DELEG_FLAG`) | + +
          + +#### Example profiles.yml for Kerberos + + + +```yaml +trino: + target: dev + outputs: + dev: + type: trino + method: kerberos + user: commander + keytab: /tmp/trino.keytab + krb5_config: /tmp/krb5.conf + principal: trino@EXAMPLE.COM + host: trino.example.com + port: 443 + database: analytics + schema: public +``` + + + +
          + + + +The following table lists the authentication parameters to set for JSON Web Token. + +For more information, refer to [JWT authentication](https://trino.io/docs/current/security/jwt.html) in the Trino docs. + +| Profile field | Example | Description | +| -------------------- | -------------- | -------------------------- | +| `method` | `jwt`| Set JWT as the authentication method. | +| `jwt_token` | `aaaaa.bbbbb.ccccc` | The JWT string. | + +
          + +#### Example profiles.yml for JWT + + + +```yaml +trino: + target: dev + outputs: + dev: + type: trino + method: jwt + jwt_token: [my_long_jwt_token_string] + host: [hostname] + database: [database name] + schema: [your dbt schema] + port: [port number] + threads: [1 or more] +``` + + + +
          + + + +The following table lists the authentication parameters to set for certificates. + +For more information, refer to [Certificate authentication](https://trino.io/docs/current/security/certificate.html) in the Trino docs. + +| Profile field | Example | Description | +| -------------------- | -------------- | ----------------------------------- | +| `method` | `certificate`| Set certificate-based authentication as the method | +| `client_certificate` | `/tmp/tls.crt` | Path to client certificate | +| `client_private_key` | `/tmp/tls.key` | Path to client private key | +| `cert` | | The full path to a certificate file | + +
          + +#### Example profiles.yml for certificate + + + +```yaml +trino: + target: dev + outputs: + dev: + type: trino + method: certificate + cert: [path/to/cert_file] + client_certificate: [path/to/client/cert] + client_private_key: [path to client key] + database: [database name] + schema: [your dbt schema] + port: [port number] + threads: [1 or more] +``` + + + +
          + + + +The only authentication parameter to set for OAuth 2.0 is `method: oauth`. If you're using Starburst Enterprise or Starburst Galaxy, you must enable OAuth 2.0 in Starburst before you can use this authentication method. + +For more information, refer to both [OAuth 2.0 authentication](https://trino.io/docs/current/security/oauth2.html) in the Trino docs and the [README](https://github.com/trinodb/trino-python-client#oauth2-authentication) for the Trino Python client. + +It's recommended that you install `keyring` to cache the OAuth 2.0 token over multiple dbt invocations by running `pip install 'trino[external-authentication-token-cache]'`. The `keyring` package is not installed by default. + +#### Example profiles.yml for OAuth + +```yaml +sandbox-galaxy: + target: oauth + outputs: + oauth: + type: trino + method: oauth + host: bunbundersders.trino.galaxy-dev.io + catalog: dbt_target + schema: dataders + port: 433 +``` + + + + + +You don't need to set up authentication (`method: none`), however, dbt Labs strongly discourages people from using it in any real application. Its use case is only for toy purposes (as in to play around with it), like local examples such as running Trino and dbt entirely within a single Docker container. + +#### Example profiles.yml for no authentication + + + +```yaml +trino: + target: dev + outputs: + dev: + type: trino + method: none + user: commander + host: trino.example.com + port: 443 + database: analytics + schema: public +``` + + + + +
          diff --git a/website/docs/docs/core/connect-data-platform/upsolver-setup.md b/website/docs/docs/core/connect-data-platform/upsolver-setup.md new file mode 100644 index 00000000000..6b2f410fc07 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/upsolver-setup.md @@ -0,0 +1,85 @@ +--- +title: "Upsolver setup" +description: "Read this guide to learn how to configure Upsolver with dbt." +id: "upsolver-setup" +meta: + maintained_by: Upsolver Team + authors: Upsolver Team + github_repo: 'Upsolver/dbt-upsolver' + pypi_package: 'dbt-upsolver' + min_core_version: 'v1.5.0' + cloud_support: Not Supported + min_supported_version: 'n/a' + slack_channel_name: 'Upsolver Community' + slack_channel_link: 'https://join.slack.com/t/upsolvercommunity/shared_invite/zt-1zo1dbyys-hj28WfaZvMh4Z4Id3OkkhA' + platform_name: 'Upsolver' + config_page: '/reference/resource-configs/upsolver-configs' +pagination_next: null +--- + +

          Overview of {frontMatter.meta.pypi_package}

          + +
            +
          • Maintained by: {frontMatter.meta.maintained_by}
          • +
          • Authors: {frontMatter.meta.authors}
          • +
          • GitHub repo: {frontMatter.meta.github_repo}
          • +
          • PyPI package: {frontMatter.meta.pypi_package}
          • +
          • Slack channel: {frontMatter.meta.slack_channel_name}
          • +
          • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
          • +
          • dbt Cloud support: {frontMatter.meta.cloud_support}
          • +
          • Minimum data platform version: {frontMatter.meta.min_supported_version}
          • +
          +

          Installing {frontMatter.meta.pypi_package}

          + +pip is the easiest way to install the adapter: + +pip install {frontMatter.meta.pypi_package} + +

          Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

          + +

          Configuring {frontMatter.meta.pypi_package}

          + +

          For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

          + +

          For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

          + +## Authentication Methods + +### User / Token authentication + +Upsolver can be configured using basic user/token authentication as shown below. + + + +```yaml +my-upsolver-db: + target: dev + outputs: + dev: + type: upsolver + api_url: https://mt-api-prod.upsolver.com + + user: [username] + token: [token] + + database: [database name] + schema: [schema name] + threads: [1 or more] + + ``` + + + +## Configurations + +The configs for Upsolver targets are shown below. + +### All configurations + +| Config | Required? | Description | +| ------ | --------- | ----------- | +| token | Yes | The token to connect Upsolver [Upsolver's documentation](https://docs.upsolver.com/sqlake/api-integration) | +| user | Yes | The user to log in as | +| database | Yes | The database that dbt should create models in | +| schema | Yes | The schema to build models into by default | +| api_url | Yes | The API url to connect. Common value ```https://mt-api-prod.upsolver.com``` | diff --git a/website/docs/docs/core/connect-data-platform/vertica-setup.md b/website/docs/docs/core/connect-data-platform/vertica-setup.md new file mode 100644 index 00000000000..fbb8de6b301 --- /dev/null +++ b/website/docs/docs/core/connect-data-platform/vertica-setup.md @@ -0,0 +1,105 @@ +--- +title: "Vertica setup" +id: "vertica-setup" +meta: + maintained_by: 'Vertica' + authors: 'Vertica (Former authors: Matthew Carter, Andy Regan, Andrew Hedengren)' + github_repo: 'vertica/dbt-vertica' + pypi_package: 'dbt-vertica' + min_core_version: 'v1.4.0' + cloud_support: 'Not Supported' + min_supported_version: 'Vertica 12.0.0' + slack_channel_name: 'n/a' + slack_channel_link: 'https://www.getdbt.com/community/' + platform_name: 'Vertica' + config_page: '/reference/resource-configs/vertica-configs' +--- + +:::info VENDOR-SUPPORTED PLUGIN + +If you're interested in contributing, check out the source code for each repository listed below. + +::: + +

          Overview of {frontMatter.meta.pypi_package}

          + +
            +
          • Maintained by: {frontMatter.meta.maintained_by}
          • +
          • Authors: {frontMatter.meta.authors}
          • +
          • GitHub repo: {frontMatter.meta.github_repo}
          • +
          • PyPI package: {frontMatter.meta.pypi_package}
          • +
          • Slack channel: {frontMatter.meta.slack_channel_name}
          • +
          • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
          • +
          • dbt Cloud support: {frontMatter.meta.cloud_support}
          • +
          • Minimum data platform version: {frontMatter.meta.min_supported_version}
          • +
          + + +

          Installing {frontMatter.meta.pypi_package}

          + +pip is the easiest way to install the adapter: pip install {frontMatter.meta.pypi_package} + +

          Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

          + +

          Configuring {frontMatter.meta.pypi_package}

          + +

          For {frontMatter.meta.pypi_package}-specific configuration, please refer to {frontMatter.meta.platform_name} Configuration.

          + +

          For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}.

          + + +

          Connecting to {frontMatter.meta.platform_name} with {frontMatter.meta.pypi_package}

          + +#### Username / password authentication + +Configure your dbt profile for using Vertica: + +##### Vertica connection information + + + +```yaml +your-profile: + outputs: + dev: + type: vertica # Don't change this! + host: [hostname] + port: [port] # or your custom port (optional) + username: [your username] + password: [your password] + database: [database name] + schema: [dbt schema] + connection_load_balance: True + backup_server_node: [list of backup hostnames or IPs] + retries: [1 or more] + threads: [1 or more] + target: dev +``` + + + + +##### Description of Profile Fields: + + + + +| Property | Description | Required? |Default Value |Example | +|--------------------------------|--------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|----------------------------|----------------------------------| +|type | The specific adapter to use. | Yes | None | vertica +| host | The host name or IP address of any active node in the Vertica Server. |Yes | None | 127.0.0.1 +| port | The port to use, default or custom. | Yes | 5433 |5433 +| username | The username to use to connect to the server. | Yes | None | dbadmin| +password |The password to use for authenticating to the server. |Yes|None|my_password| +database |The name of the database running on the server. |Yes | None | my_db | +schema| The schema to build models into.| No| None |VMart| +connection_load_balance| A Boolean value that indicates whether the connection can be redirected to a host in the database other than host.| No| True |True| +backup_server_node| List of hosts to connect to if the primary host specified in the connection (host, port) is unreachable. Each item in the list should be either a host string (using default port 5433) or a (host, port) tuple. 
A host can be a host name or an IP address.| No| None |['123.123.123.123','www.abc.com',('123.123.123.124',5433)]| +retries |The retry times after an unsuccessful connection.| No| 2 |3| +threads |The number of threads the dbt project will run on.| No| 1| 3| +label| A session label to identify the connection. |No |An auto-generated label with format of: dbt_username |dbt_dbadmin| + + +For more information on Vertica’s connection properties please refer to [Vertica-Python](https://github.com/vertica/vertica-python#create-a-connection) Connection Properties. + + diff --git a/website/docs/docs/core/dbt-core-environments.md b/website/docs/docs/core/dbt-core-environments.md new file mode 100644 index 00000000000..c7f340557fd --- /dev/null +++ b/website/docs/docs/core/dbt-core-environments.md @@ -0,0 +1,17 @@ +--- +title: "dbt Core environments" +id: "dbt-core-environments" +pagination_next: "docs/running-a-dbt-project/run-your-dbt-projects" +--- + +dbt makes it easy to maintain separate production and development environments through the use of [targets](/reference/dbt-jinja-functions/target.md) within a [profile](/docs/core/connect-data-platform/profiles.yml). A typical profile, when using dbt locally (for example, running from your command line), will have a target named `dev` and have this set as the default. This means that while making changes, your objects will be built in your _development_ target without affecting production queries made by your end users. Once you are confident in your changes, you can deploy the code to _production_, by running your dbt project with a _prod_ target. + +:::info Running dbt in production + +You can learn more about different ways to run dbt in production in [this article](/docs/deploy/deployments). + +::: + +Targets offer the flexibility to decide how to implement your separate environments – whether you want to use separate schemas, databases, or entirely different clusters altogether! 
We recommend using _different schemas within one database_ to separate your environments. This is the easiest to set up and is the most cost-effective solution in a modern cloud-based data stack. + +In practice, this means that most of the details in a target will be consistent across all targets, except for the `schema` and user credentials. If you have multiple dbt users writing code, it often makes sense for _each user_ to have their own _development_ environment. A pattern we've found useful is to set your dev target schema to be `dbt_`. User credentials should also differ across targets so that each dbt user is using their own data warehouse user. diff --git a/website/docs/docs/get-started/docker-install.md b/website/docs/docs/core/docker-install.md similarity index 83% rename from website/docs/docs/get-started/docker-install.md rename to website/docs/docs/core/docker-install.md index 0a34cf4ec01..dfb2a669e34 100644 --- a/website/docs/docs/get-started/docker-install.md +++ b/website/docs/docs/core/docker-install.md @@ -3,16 +3,16 @@ title: "Install with Docker" description: "You can use Docker to install dbt and adapter plugins from the command line." --- -dbt Core and all adapter plugins maintained by dbt Labs are available as [Docker](https://docs.docker.com/) images, and distributed via [GitHub Packages](https://docs.github.com/en/packages/learn-github-packages/introduction-to-github-packages). +dbt Core and all adapter plugins maintained by dbt Labs are available as [Docker](https://docs.docker.com/) images, and distributed via [GitHub Packages](https://docs.github.com/en/packages/learn-github-packages/introduction-to-github-packages) in a [public registry](https://github.com/dbt-labs/dbt-core/pkgs/container/dbt-core). Using a prebuilt Docker image to install dbt Core in production has a few benefits: it already includes dbt-core, one or more database adapters, and pinned versions of all their dependencies. 
By contrast, `pip install dbt-core dbt-` takes longer to run, and will always install the latest compatible versions of every dependency. -You might also be able to use Docker to install and develop locally if you don't have a Python environment set up. Note that running dbt in this manner can be significantly slower if your operating system differs from the system that built the Docker image. If you're a frequent local developer, we recommend that you install dbt Core via [Homebrew](/docs/get-started/homebrew-install) or [pip](/docs/get-started/pip-install) instead. +You might also be able to use Docker to install and develop locally if you don't have a Python environment set up. Note that running dbt in this manner can be significantly slower if your operating system differs from the system that built the Docker image. If you're a frequent local developer, we recommend that you install dbt Core via [Homebrew](/docs/core/homebrew-install) or [pip](/docs/core/pip-install) instead. ### Prerequisites * You've installed Docker. For more information, see the [Docker](https://docs.docker.com/) site. -* You understand which database adapter(s) you need. For more information, see [About dbt adapters](/docs/get-started/installation#about-dbt-adapters). -* You understand how dbt Core is versioned. For more information, see [About dbt Core versions](core-versions). +* You understand which database adapter(s) you need. For more information, see [About dbt adapters](/docs/core/installation#about-dbt-adapters). +* You understand how dbt Core is versioned. For more information, see [About dbt Core versions](/docs/dbt-versions/core). * You have a general understanding of the dbt, dbt workflow, developing locally in the command line interface (CLI). For more information, see [About dbt](/docs/introduction#how-do-i-use-dbt). 
### Install a dbt Docker image from Github Packages @@ -51,4 +51,4 @@ In particular, the Dockerfile supports building images: - Images that install one or more third-party adapters - Images against another system architecture -Please note that, if you go the route of building your own Docker images, we are unable to offer dedicated support for custom use cases. If you run into problems, you are welcome to [ask the community for help](getting-help) or [open an issue](oss-expectations#issues) in the `dbt-core` repository. If many users are requesting the same enhancement, we will tag the issue `help_wanted` and invite community contribution. +Please note that, if you go the route of building your own Docker images, we are unable to offer dedicated support for custom use cases. If you run into problems, you are welcome to [ask the community for help](/community/resources/getting-help) or [open an issue](/community/resources/oss-expectations#issues) in the `dbt-core` repository. If many users are requesting the same enhancement, we will tag the issue `help_wanted` and invite community contribution. diff --git a/website/docs/docs/get-started/homebrew-install.md b/website/docs/docs/core/homebrew-install.md similarity index 79% rename from website/docs/docs/get-started/homebrew-install.md rename to website/docs/docs/core/homebrew-install.md index 7617ff6d03f..2e2676c4a95 100644 --- a/website/docs/docs/get-started/homebrew-install.md +++ b/website/docs/docs/core/homebrew-install.md @@ -3,6 +3,13 @@ title: "Install with Homebrew" description: "You can use Homebrew to install dbt Core and adapter plugins from the command line." --- +:::caution + +Starting with v1.6, dbt Labs will no longer maintain Homebrew formulae as a supported installation method for dbt-core and adapters. 
For more on our rationale, consult this discussion: +- [Installing dbt Core: saying goodbye to brew and hello to "bundles"](https://github.com/dbt-labs/dbt-core/discussions/8277) + +::: + dbt Labs maintains Homebrew formulae for the four oldest and most popular adapter plugins: Postgres, Redshift, Snowflake, and BigQuery. We recommend you use Homebrew if you meet these conditions: @@ -16,7 +23,7 @@ If that sounds like you, great! Homebrew makes it significantly easier to instal - Installation with Homebrew can take longer than installing with other methods, because `brew` takes care of more setup behind the scenes - If you're using an M1 Mac, we recommend that you install dbt via Homebrew with [Rosetta](https://support.apple.com/en-us/HT211861). This is necessary for certain dependencies that are only supported on Intel processors. -If you're someone who prefers to manage Python environments yourself, such as having multiple versions of Python to switch between, we recommend you install dbt Core via [`pip` instead](/docs/get-started/pip-install). +If you're someone who prefers to manage Python environments yourself, such as having multiple versions of Python to switch between, we recommend you install dbt Core via [`pip` instead](/docs/core/pip-install). ### Installing with Homebrew @@ -28,7 +35,7 @@ brew install git brew tap dbt-labs/dbt ``` -Now you're ready to install dbt. Once you know [which adapter](supported-data-platforms) you're using, you can install it as `dbt-`. For instance, if using Postgres: +Now you're ready to install dbt. Once you know [which adapter](/docs/supported-data-platforms) you're using, you can install it as `dbt-`. For instance, if using Postgres: ```shell brew install dbt-postgres @@ -36,7 +43,7 @@ brew install dbt-postgres Everywhere below that you see ``, replace it with the adapter name you're using. 
-**Note**: If you're using an adapter that isn't available as a Homebrew formula, we recommend you use [pip](/docs/get-started/pip-install) instead. +**Note**: If you're using an adapter that isn't available as a Homebrew formula, we recommend you use [pip](/docs/core/pip-install) instead. ### Upgrading dbt and your adapter diff --git a/website/docs/docs/core/installation-overview.md b/website/docs/docs/core/installation-overview.md new file mode 100644 index 00000000000..25628b76358 --- /dev/null +++ b/website/docs/docs/core/installation-overview.md @@ -0,0 +1,34 @@ +--- +title: "About installing dbt" +id: "installation" +description: "You can install dbt Core using a few different tested methods." +pagination_next: "docs/core/homebrew-install" +pagination_prev: null +--- + +You can install dbt Core on the command line by using one of these methods: + +- [Use pip to install dbt](/docs/core/pip-install) (recommended) +- [Use Homebrew to install dbt](/docs/core/homebrew-install) +- [Use a Docker image to install dbt](/docs/core/docker-install) +- [Install dbt from source](/docs/core/source-install) + +:::tip Pro tip: Using the --help flag + +Most command-line tools, including dbt, have a `--help` flag that you can use to show available commands and arguments. For example, you can use the `--help` flag with dbt in two ways:

          +— `dbt --help`: Lists the commands available for dbt
          +— `dbt run --help`: Lists the flags available for the `run` command + +::: + +## Upgrading dbt Core + +dbt provides a number of resources for understanding [general best practices](/blog/upgrade-dbt-without-fear) while upgrading your dbt project as well as detailed [migration guides](/guides/migration/versions/upgrading-to-v1.4) highlighting the changes required for each minor and major release, and [core versions](/docs/dbt-versions/core) + +- [Upgrade Homebrew](/docs/core/homebrew-install#upgrading-dbt-and-your-adapter) +- [Upgrade `pip`](/docs/core/pip-install#change-dbt-core-versions) + + +## About dbt data platforms and adapters + +dbt works with a number of different data platforms (databases, query engines, and other SQL-speaking technologies). It does this by using a dedicated _adapter_ for each. When you install dbt Core, you'll also want to install the specific adapter for your database. For more details, see [Supported Data Platforms](/docs/supported-data-platforms). diff --git a/website/docs/docs/core/pip-install.md b/website/docs/docs/core/pip-install.md new file mode 100644 index 00000000000..44fac00e493 --- /dev/null +++ b/website/docs/docs/core/pip-install.md @@ -0,0 +1,107 @@ +--- +title: "Install with pip" +description: "You can use pip to install dbt Core and adapter plugins from the command line." +--- + +You need to use `pip` to install dbt Core on Windows or Linux operating systems. You can use `pip` or [Homebrew](/docs/core/homebrew-install) for installing dbt Core on a MacOS. + +You can install dbt Core and plugins using `pip` because they are Python modules distributed on [PyPI](https://pypi.org/project/dbt-core/). + + + + +### Using virtual environments +We recommend using virtual environments (venv) to namespace pip modules. + +1. Create a new venv: + +```shell +python3 -m venv dbt-env # create the environment +``` + +2. 
Activate that same virtual environment each time you create a shell window or session: + +```shell +source dbt-env/bin/activate # activate the environment for Mac and Linux OR +dbt-env\Scripts\activate # activate the environment for Windows +``` + +#### Create an alias +To activate your dbt environment with every new shell window or session, you can create an alias for the source command in your $HOME/.bashrc, $HOME/.zshrc, or whichever config file your shell draws from. + +For example, add the following to your rc file, replacing with the path to your virtual environment configuration. + +```shell +alias env_dbt='source /bin/activate' +``` + +### Installing the adapter +Once you know [which adapter](/docs/supported-data-platforms) you're using, you can install it as `dbt-`. For example, if using Postgres: + +```shell +pip install dbt-postgres +``` + +This will install `dbt-core` and `dbt-postgres` _only_: + +```shell +$ dbt --version +installed version: 1.0.0 + latest version: 1.0.0 + +Up to date! + +Plugins: + - postgres: 1.0.0 +``` + +All adapters build on top of `dbt-core`. Some also depend on other adapters: for example, `dbt-redshift` builds on top of `dbt-postgres`. In that case, you would see those adapters included by your specific installation, too. + +### Upgrade adapters + +To upgrade a specific adapter plugin: + +```shell +pip install --upgrade dbt- +``` + +### Install dbt-core only + +If you're building a tool that integrates with dbt Core, you may want to install the core library alone, without a database adapter. Note that you won't be able to use dbt as a CLI tool. + +```shell +pip install dbt-core +``` +### Change dbt Core versions + +You can upgrade or downgrade versions of dbt Core by using the `--upgrade` option on the command line (CLI). For more information, see [Best practices for upgrading in Core versions](/docs/dbt-versions/core#best-practices-for-upgrading). 
+ +To upgrade dbt to the latest version: + +``` +pip install --upgrade dbt-core +``` + +To downgrade to an older version, specify the version you want to use. This command can be useful when you're resolving package dependencies. As an example: + +``` +pip install --upgrade dbt-core==0.19.0 +``` + +### `pip install dbt` + +Note that, as of v1.0.0, `pip install dbt` is no longer supported and will raise an explicit error. Since v0.13, the PyPI package named `dbt` was a simple "pass-through" of `dbt-core` and the four original database adapter plugins. For v1, we formalized that split. + +If you have workflows or integrations that relied on installing the package named `dbt`, you can achieve the same behavior going forward by installing the same five packages that it used: + +```shell +pip install \ + dbt-core \ + dbt-postgres \ + dbt-redshift \ + dbt-snowflake \ + dbt-bigquery \ + dbt-trino +``` + +Or, better yet, just install the package(s) you need! diff --git a/website/docs/docs/get-started/source-install.md b/website/docs/docs/core/source-install.md similarity index 92% rename from website/docs/docs/get-started/source-install.md rename to website/docs/docs/core/source-install.md index 6714e88cd10..42086159c03 100644 --- a/website/docs/docs/get-started/source-install.md +++ b/website/docs/docs/core/source-install.md @@ -1,6 +1,7 @@ --- title: "Install from source" description: "You can install dbt Core from its GitHub code source." +pagination_next: null --- dbt Core and almost all of its adapter plugins are open source software. As such, the codebases are freely available to download and build from source. You might install from source if you want the latest code or want to install dbt from a specific commit. This might be helpful when you are contributing changes, or if you want to debug a past change. 
@@ -35,6 +36,6 @@ You do _not_ need to install `dbt-core` before installing an adapter plugin -- t To install in editable mode, such as while contributing, use `pip install -e .` instead. - - - + + + diff --git a/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md b/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md index 499d2e904c7..168ec0c80f4 100644 --- a/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md +++ b/website/docs/docs/dbt-cloud-apis/admin-cloud-api.md @@ -1,13 +1,36 @@ --- title: "dbt Cloud Administrative API" id: "admin-cloud-api" +pagination_next: "docs/dbt-cloud-apis/discovery-api" --- -The dbt Cloud Administrative API is enabled by default for _Team_ and _Enterprise_ plans. It can be used to: +The dbt Cloud Administrative API is enabled by default for [Team and Enterprise plans](https://www.getdbt.com/pricing/). It can be used to: - Download artifacts after a job has completed - Kick off a job run from an orchestration tool - Manage your dbt Cloud account - and more -Reference documentation for the dbt Cloud Administrative v2 API can be found [here](/dbt-cloud/api-v2). +dbt Cloud currently supports two versions of the Administrative API: v2 and v3. In general, v3 is the recommended version to use, but we don't yet have all our v2 routes upgraded to v3. We're currently working on this. If you can't find something in our v3 docs, check out the shorter list of v2 endpoints because you might find it there. + +
          + + + + + + + +
          diff --git a/website/docs/docs/dbt-cloud-apis/apis-overview.md b/website/docs/docs/dbt-cloud-apis/apis-overview.md index fc8e7c76009..eef64992af9 100644 --- a/website/docs/docs/dbt-cloud-apis/apis-overview.md +++ b/website/docs/docs/dbt-cloud-apis/apis-overview.md @@ -1,17 +1,22 @@ --- title: "APIs Overview" +description: "Learn how dbt accounts on the Team and Enterprise plans can query the dbt Cloud APIs." id: "overview" +pagination_next: "docs/dbt-cloud-apis/user-tokens" +pagination_prev: null --- ## Overview Accounts on the _Team_ and _Enterprise_ plans can query the dbt Cloud APIs. -dbt Cloud provides two APIs. +dbt Cloud provides the following APIs: -The [dbt Cloud Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) can be used to administrate a dbt Cloud account. +- The [dbt Cloud Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) can be used to administrate a dbt Cloud account. +- The [dbt Cloud Discovery API](/docs/dbt-cloud-apis/discovery-api) can be used to fetch metadata related to the state and health of your dbt project. +- The [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) provide multiple API options which allow you to query your metrics defined in the dbt Semantic Layer. -The [dbt Metadata API](docs/dbt-cloud-apis/metadata-api) can be used to fetch metadata related to the state and health of your dbt project. +If you want to learn more about webhooks, refer to [Webhooks for your jobs](/docs/deploy/webhooks). ## How to Access the APIs diff --git a/website/docs/docs/dbt-cloud-apis/authentication.md b/website/docs/docs/dbt-cloud-apis/authentication.md new file mode 100644 index 00000000000..7deadd68f18 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/authentication.md @@ -0,0 +1,22 @@ +--- +title: "Authentication" +description: "Learn how to authenticate with user tokens and service account tokens " +pagination_next: "docs/dbt-cloud-apis/user-tokens" +pagination_prev: null +--- + +
          + + + + + +
          \ No newline at end of file diff --git a/website/docs/docs/dbt-cloud-apis/discovery-api.md b/website/docs/docs/dbt-cloud-apis/discovery-api.md new file mode 100644 index 00000000000..747128cf7bc --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/discovery-api.md @@ -0,0 +1,90 @@ +--- +title: "About the Discovery API" +pagination_next: "docs/dbt-cloud-apis/discovery-use-cases-and-examples" +--- + +Every time dbt Cloud runs a project, it generates and stores information about the project. The metadata includes details about your project’s models, sources, and other nodes along with their execution results. With the dbt Cloud Discovery API, you can query this comprehensive information to gain a better understanding of your DAG and the data it produces. + +By leveraging the metadata in dbt Cloud, you can create systems for data monitoring and alerting, lineage exploration, and automated reporting. This can help you improve data discovery, data quality, and pipeline operations within your organization. + +You can access the Discovery API through [ad hoc queries](/docs/dbt-cloud-apis/discovery-querying), custom applications, a wide range of [partner ecosystem integrations](https://www.getdbt.com/product/integrations/) (like BI/analytics, catalog and governance, and quality and observability), and by using dbt Cloud features like [model timing](/docs/deploy/run-visibility#model-timing) and [dashboard status tiles](/docs/deploy/dashboard-status-tiles). + + + + +You can query the dbt Cloud metadata: + +- At the [environment](/docs/environments-in-dbt) level for both the latest state (use the `environment` endpoint) and historical run results (use `modelByEnvironment`) of a dbt Cloud project in production. +- At the job level for results on a specific dbt Cloud job run for a given resource type, like `models` or `test`. + +:::tip Public Preview +The Discovery API is currently available in Public Preview for dbt Cloud accounts on a Team or Enterprise plan. 
It’s available to all multi-tenant and to only select single-tenant accounts (please ask your account team to confirm). Preview features are stable and can be considered for production deployments, but there might still be some planned additions and modifications to product behavior before moving to General Availability. For details, refer to [dbt Product lifecycles](/docs/dbt-versions/product-lifecycles). + +::: + +## What you can use the Discovery API for + +Click the tabs below to learn more about the API's use cases, the analysis you can do, and the results you can achieve by integrating with it. + +To use the API directly or integrate your tool with it, refer to [Use cases and examples](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) for detailed information. + + + + + +Use the API to look at historical information like model build time to determine the health of your dbt projects. Finding inefficiencies in orchestration configurations can help decrease infrastructure costs and improve timeliness. To learn more about how to do this, refer to [Performance](/docs/dbt-cloud-apis/discovery-use-cases-and-examples#performance). + +You can use, for example, the [model timing](/docs/deploy/run-visibility#model-timing) tab to help identify and optimize bottlenecks in model builds: + + + + + + + +Use the API to determine if the data is accurate and up-to-date by monitoring test failures, source freshness, and run status. Accurate and reliable information is valuable for analytics, decisions, and monitoring to help prevent your organization from making bad decisions. To learn more about this, refer to [Quality](/docs/dbt-cloud-apis/discovery-use-cases-and-examples#quality). + +When used with [webhooks](/docs/deploy/webhooks), it can also help with detecting, investigating, and alerting issues. + + + + + +Use the API to find and understand dbt assets in integrated tools using information like model and metric definitions, and column information. 
For more details, refer to [Discovery](/docs/dbt-cloud-apis/discovery-use-cases-and-examples#discovery). + +Data producers must manage and organize data for stakeholders, while data consumers need to quickly and confidently analyze data on a large scale to make informed decisions that improve business outcomes and reduce organizational overhead. The API is useful for discovery data experiences in catalogs, analytics, apps, and machine learning (ML) tools. It can help you understand the origin and meaning of datasets for your analysis. + + + + + + +Use the API to review who developed the models and who uses them to help establish standard practices for better governance. For more details, refer to [Governance](/docs/dbt-cloud-apis/discovery-use-cases-and-examples#governance). + + + + + +Use the API to review dataset changes and uses by examining exposures, lineage, and dependencies. From the investigation, you can learn how to define and build more effective dbt projects. For more details, refer to [Development](/docs/dbt-cloud-apis/discovery-use-cases-and-examples#development). + + + + + + + + +## Types of project state + +There are two types of [project state](/docs/dbt-cloud-apis/project-state) at the environment level that you can query the results of: + +- **Definition** — The logical state of a dbt project’s [resources](/docs/build/projects) that update when the project is changed. +- **Applied** — The output of successful dbt DAG execution that creates or describes the state of the database (for example: `dbt run`, `dbt test`, source freshness, and so on) + +These states allow you to easily examine the difference between a model’s definition and its applied state so you can get answers to questions like, did the model run? or did the run fail? Applied models exist as a table/view in the data platform given their most recent successful run. 
+ +## Related docs + +- [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) +- [Query the Discovery API](/docs/dbt-cloud-apis/discovery-querying) +- [Schema](/docs/dbt-cloud-apis/discovery-schema-job) diff --git a/website/docs/docs/dbt-cloud-apis/discovery-querying.md b/website/docs/docs/dbt-cloud-apis/discovery-querying.md new file mode 100644 index 00000000000..35c092adb4b --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/discovery-querying.md @@ -0,0 +1,234 @@ +--- +title: "Query the Discovery API" +id: "discovery-querying" +sidebar_label: "Query the Discovery API" +pagination_next: "docs/dbt-cloud-apis/discovery-schema-environment" +--- + +The Discovery API supports ad-hoc queries and integrations. If you are new to the API, refer to [About the Discovery API](/docs/dbt-cloud-apis/discovery-api) for an introduction. + +Use the Discovery API to evaluate data pipeline health and project state across runs or at a moment in time. dbt Labs provide a [GraphQL explorer](https://metadata.cloud.getdbt.com/graphql) for this API, enabling you to run queries and browse the schema. + +Since GraphQL describes the data in the API, the schema displayed in the GraphQL explorer accurately represents the graph and fields available to query. + + + +## Authorization + +Currently, authorization of requests takes place [using a service token](/docs/dbt-cloud-apis/service-tokens). dbt Cloud admin users can generate a Metadata Only service token that is authorized to execute a specific query against the Discovery API. + +Once you've created a token, you can use it in the Authorization header of requests to the dbt Cloud Discovery API. Be sure to include the Token prefix in the Authorization header, or the request will fail with a `401 Unauthorized` error. Note that `Bearer` can be used instead of `Token` in the Authorization header. Both syntaxes are equivalent. + +## Access the Discovery API + +1. 
Create a [service account token](/docs/dbt-cloud-apis/service-tokens) to authorize requests. dbt Cloud Admin users can generate a _Metadata Only_ service token, which can be used to execute a specific query against the Discovery API to authorize requests. + +2. Find your API URL using the endpoint `https://metadata.{YOUR_ACCESS_URL}/graphql`. + + * Replace `{YOUR_ACCESS_URL}` with the appropriate [Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. For example, if your multi-tenant region is North America, your endpoint is `https://metadata.cloud.getdbt.com/graphql`. If your multi-tenant region is EMEA, your endpoint is `https://metadata.emea.dbt.com/graphql`. + +3. For specific query points, refer to the [schema documentation](/docs/dbt-cloud-apis/discovery-schema-job). + + +## Run queries using HTTP requests + +You can run queries by sending a `POST` request to the `https://metadata.YOUR_ACCESS_URL/graphql` endpoint, making sure to replace: +* `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. +* `YOUR_TOKEN` in the Authorization header with your actual API token. Be sure to include the Token prefix. +* `QUERY_BODY` with a GraphQL query, for example `{ "query": "" }` +* `VARIABLES` with a dictionary of your GraphQL query variables, such as a job ID or a filter. +* `ENDPOINT` with the endpoint you're querying, such as environment. + + ```shell + curl 'https://metadata.YOUR_ACCESS_URL/graphql' \ + -H 'authorization: Bearer YOUR_TOKEN' \ + -H 'content-type: application/json' + -X POST + --data QUERY_BODY + ``` + +Python example: + +```python +response = requests.post( + 'YOUR_ACCESS_URL', + headers={"authorization": "Bearer "+YOUR_TOKEN, "content-type": "application/json"}, + json={"query": QUERY_BODY, "variables": VARIABLES} +) + +metadata = response.json()['data'][ENDPOINT] +``` + +Every query will require an environment ID or job ID. 
You can get the ID from a dbt Cloud URL or using the Admin API. + +There are several illustrative example queries on this page. For more examples, refer to [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples). + + +## Reasonable use + +Discovery (GraphQL) API usage is subject to request rate and response size limits to maintain the performance and stability of the metadata platform and prevent abuse. + +Job-level endpoints are subject to query complexity limits. Nested nodes (like parents), code (like rawCode), and catalog columns are considered as most complex. Overly complex queries should be broken up into separate queries with only necessary fields included. dbt Labs recommends using the environment endpoint instead for most use cases to get the latest descriptive and result metadata for a dbt Cloud project. + +## Retention limits +You can use the Discovery API to query data from the previous three months. For example, if today was April 1st, you could query data back to January 1st. + +## Run queries with the GraphQL explorer + +You can run ad-hoc queries directly in the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and use the document explorer on the left-hand side to see all possible nodes and fields. + +Refer to the [Apollo explorer documentation](https://www.apollographql.com/docs/graphos/explorer/explorer) for setup and authorization info. + +1. Access the [GraphQL API explorer](https://metadata.cloud.getdbt.com/graphql) and select fields you want to query. + +2. Select **Variables** at the bottom of the explorer and replace any `null` fields with your unique values. + +3. [Authenticate](https://www.apollographql.com/docs/graphos/explorer/connecting-authenticating#authentication) using Bearer auth with `YOUR_TOKEN`. Select **Headers** at the bottom of the explorer and select **+New header**. + +4. 
Select **Authorization** in the **header key** dropdown list and enter your Bearer auth token in the **value** field. Remember to include the Token prefix. Your header key should be in this format: `{"Authorization": "Bearer <YOUR_TOKEN>"}`. + + + +
          + + + +1. Run your query by clicking the blue query button in the top right of the **Operation** editor (to the right of the query). You should see a successful query response on the right side of the explorer. + + + + + + +### Fragments + +Use the [`... on`](https://www.apollographql.com/docs/react/data/fragments/) notation to query across lineage and retrieve results from specific node types. + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" }) { + edges { + node { + name + ancestors(types: [Model, Source, Seed, Snapshot]) { + ... on ModelAppliedStateNestedNode { + name + resourceType + materializedType + executionInfo { + executeCompletedAt + } + } + ... on SourceAppliedStateNestedNode { + sourceName + name + resourceType + freshness { + maxLoadedAt + } + } + ... on SnapshotAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } + } + ... on SeedAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } + } + } + } + } + } + } + } +} +``` + +### Pagination + +Querying large datasets can impact performance on multiple functions in the API pipeline. Pagination eases the burden by returning smaller data sets one page at a time. This is useful for returning a particular portion of the dataset or the entire dataset piece-by-piece to enhance performance. dbt Cloud utilizes cursor-based pagination, which makes it easy to return pages of constantly changing data. + +Use the `PageInfo` object to return information about the page. The available fields are: + +- `startCursor` string type — Corresponds to the first `node` in the `edge`. +- `endCursor` string type — Corresponds to the last `node` in the `edge`. +- `hasNextPage` boolean type — Whether or not there are more `nodes` after the returned results. 
+ +There are connection variables available when making the query: + +- `first` integer type — Returns the first n `nodes` for each page, up to 500. +- `after` string type — Sets the cursor to retrieve `nodes` after. It's best practice to set the `after` variable with the object ID defined in the `endCursor` of the previous page. + +Below is an example that returns the `first` 500 models `after` the specified Object ID in the variables. The `PageInfo` object returns the object ID where the cursor starts, where it ends, and whether there is a next page. + + + + + + +Below is a code example of the `PageInfo` object: + +```graphql +pageInfo { + startCursor + endCursor + hasNextPage +} +totalCount # Total number of records across all pages +``` + +### Filters + +Filtering helps to narrow down the results of an API query. If you want to query and return only models and tests that are failing or find models that are taking too long to run, you can fetch execution details such as [`executionTime`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields), [`runElapsedTime`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields), or [`status`](/docs/dbt-cloud-apis/discovery-schema-job-models#fields). This helps data teams monitor the performance of their models, identify bottlenecks, and optimize the overall data pipeline. + +Below is an example that filters for results of models that have succeeded on their `lastRunStatus`: + + + +Below is an example that filters for models that have an error on their last run and tests that have failed: + + + + +```graphql +query ModelsAndTests($environmentId: BigInt!, $first: Int!) 
{ + environment(id: $environmentId) { + applied { + models(first: $first, filter: { lastRunStatus: error }) { + edges { + node { + name + executionInfo { + lastRunId + } + } + } + } + tests(first: $first, filter: { status: "fail" }) { + edges { + node { + name + executionInfo { + lastRunId + } + } + } + } + } + } +} +``` + +## Related content + +- [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) +- [Schema](/docs/dbt-cloud-apis/discovery-schema-job) diff --git a/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md b/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md new file mode 100644 index 00000000000..8efb1ec0d37 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/discovery-use-cases-and-examples.md @@ -0,0 +1,1176 @@ +--- +title: "Use cases and examples for the Discovery API" +sidebar_label: "Uses and examples" +--- + +With the Discovery API, you can query the metadata in dbt Cloud to learn more about your dbt deployments and the data it generates to analyze them and make improvements. + +You can use the API in a variety of ways to get answers to your business questions. Below describes some of the uses of the API and is meant to give you an idea of the questions this API can help you answer. + +| Use Case | Outcome | Example Questions | +| --- | --- | --- | +| [Performance](#performance) | Identify inefficiencies in pipeline execution to reduce infrastructure costs and improve timeliness. |
          • What’s the latest status of each model?
          • Do I need to run this model?
          • How long did my DAG take to run?
          | +| [Quality](#quality) | Monitor data source freshness and test results to resolve issues and drive trust in data. |
          • How fresh are my data sources?
          • Which tests and models failed?
          • What’s my project’s test coverage?
          | +| [Discovery](#discovery) | Find and understand relevant datasets and semantic nodes with rich context and metadata. |
          • What do these tables and columns mean?
          • What’s the full data lineage?
          • Which metrics can I query?
          | +| [Governance](#governance) | Audit data development and facilitate collaboration within and between teams. |
          • Who is responsible for this model?
          • How do I contact the model’s owner?
          • Who can use this model?
          | +| [Development](#development) | Understand dataset changes and usage and gauge impacts to inform project definition. |
          • How is this metric used in BI tools?
          • Which nodes depend on this data source?
          • How has a model changed? What impact?
          | + +## Performance + +You can use the Discovery API to identify inefficiencies in pipeline execution to reduce infrastructure costs and improve timeliness. Below are example questions and queries you can run. + +For performance use cases, people typically query the historical or latest applied state across any part of the DAG (for example, models) using the `environment`, `modelByEnvironment`, or job-level endpoints. + +### How long did each model take to run? + +It’s helpful to understand how long it takes to build models (tables) and execute tests during a dbt run. Longer model build times result in higher infrastructure costs and fresh data arriving later to stakeholders. Analyses like these can be done in observability tools or with ad-hoc queries, like in a notebook. + + + +
          +Example query with code + +Data teams can monitor the performance of their models, identify bottlenecks, and optimize the overall data pipeline by fetching execution details like `executionTime` and `runElapsedTime`: + +1. Use latest state environment-level API to get a list of all executed models and their execution time. Then, sort the models by `executionTime` in descending order. + +```graphql +query AppliedModels($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + name + uniqueId + materializedType + executionInfo { + lastSuccessRunId + executionTime + executeStartedAt + } + } + } + } + } + } +} +``` + +2. Get the most recent 20 run results for the longest running model. Review the results of the model across runs or you can go to the job/run or commit itself to investigate further. + +```graphql +query ModelHistoricalRuns( + $environmentId: BigInt! + $uniqueId: String + $lastRunCount: Int +) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns( + uniqueId: $uniqueId + lastRunCount: $lastRunCount + ) { + name + runId + runElapsedTime + runGeneratedAt + executionTime + executeStartedAt + executeCompletedAt + status + } + } + } +} +``` + +3. Use the query results to plot a graph of the longest running model’s historical run time and execution time trends. 
+ + +```python +# Import libraries +import os +import matplotlib.pyplot as plt +import pandas as pd +import requests + +# Set API key +auth_token = *[SERVICE_TOKEN_HERE]* + +# Query the API +def query_discovery_api(auth_token, gql_query, variables): + response = requests.post('https://metadata.cloud.getdbt.com/graphql', + headers={"authorization": "Bearer "+auth_token, "content-type": "application/json"}, + json={"query": gql_query, "variables": variables}) + data = response.json()['data'] + + return data + +# Get the latest run metadata for all models +models_latest_metadata = query_discovery_api(auth_token, query_one, variables_query_one)['environment'] + +# Convert to dataframe +models_df = pd.DataFrame([x['node'] for x in models_latest_metadata['applied']['models']['edges']]) + +# Unnest the executionInfo column +models_df = pd.concat([models_df.drop(['executionInfo'], axis=1), models_df['executionInfo'].apply(pd.Series)], axis=1) + +# Sort the models by execution time +models_df_sorted = models_df.sort_values('executionTime', ascending=False) + +print(models_df_sorted) + +# Get the uniqueId of the longest running model +longest_running_model = models_df_sorted.iloc[0]['uniqueId'] + +# Define second query variables +variables_query_two = { + "environmentId": *[ENVR_ID_HERE]* + "lastRunCount": 10, + "uniqueId": longest_running_model +} + +# Get the historical run metadata for the longest running model +model_historical_metadata = query_discovery_api(auth_token, query_two, variables_query_two)['environment']['applied']['modelHistoricalRuns'] + +# Convert to dataframe +model_df = pd.DataFrame(model_historical_metadata) + +# Filter dataframe to only successful runs +model_df = model_df[model_df['status'] == 'success'] + +# Convert the runGeneratedAt, executeStartedAt, and executeCompletedAt columns to datetime +model_df['runGeneratedAt'] = pd.to_datetime(model_df['runGeneratedAt']) +model_df['executeStartedAt'] = pd.to_datetime(model_df['executeStartedAt']) 
+model_df['executeCompletedAt'] = pd.to_datetime(model_df['executeCompletedAt']) + +# Plot the runElapsedTime over time +plt.plot(model_df['runGeneratedAt'], model_df['runElapsedTime']) +plt.title('Run Elapsed Time') +plt.show() + +# # Plot the executionTime over time +plt.plot(model_df['executeStartedAt'], model_df['executionTime']) +plt.title(model_df['name'].iloc[0]+" Execution Time") +plt.show() +``` + +Plotting examples: + + + + + + +
          + +### What’s the latest state of each model? + +The Discovery API provides information about the applied state of models and how they arrived in that state. You can retrieve the status information from the most recent run and most recent successful run (execution) from the `environment` endpoint and dive into historical runs using job-based and `modelByEnvironment` endpoints. + +
          +Example query + +The API returns full identifier information (`database.schema.alias`) and the `executionInfo` for both the most recent run and most recent successful run from the database: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + compiledCode + database + schema + alias + materializedType + executionInfo { + executeCompletedAt + lastJobDefinitionId + lastRunGeneratedAt + lastRunId + lastRunStatus + lastRunError + lastSuccessJobDefinitionId + runGeneratedAt + lastSuccessRunId + } + } + } + } + } + } +} +``` + +
          + +### What happened with my job run? + +You can query the metadata at the job level to review results for specific runs. This is helpful for historical analysis of deployment performance or optimizing particular jobs. + +import DiscoveryApiJobDeprecationNotice from '/snippets/_discovery_api_job_deprecation_notice.md'; + + + +
          +Example query + +Deprecated example: +```graphql +query ($jobId: Int!, $runId: Int!) { + models(jobId: $jobId, runId: $runId) { + name + status + tests { + name + status + } + } +} +``` + +New example: + +```graphql +query ($jobId: BigInt!, $runId: BigInt!) { + job(id: $jobId, runId: $runId) { + models { + name + status + tests { + name + status + } + } + } +} +``` + +
          + +### What’s changed since the last run? +Unnecessary runs incur higher infrastructure costs and load on the data team and their systems. A model doesn’t need to be run if it’s a view and there's no code change since the last run, or if it’s a table/incremental with no code change since last run and source data has not been updated since the last run. + +
          +Example query + +With the API, you can compare the `rawCode` between the definition and applied state, and review when the sources were last loaded (source `maxLoadedAt` relative to model `executeCompletedAt`) given the `materializedType` of the model: + + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + rawCode + ancestors(types: [Source]) { + ... on SourceAppliedStateNestedNode { + freshness { + maxLoadedAt + } + } + } + executionInfo { + runGeneratedAt + executeCompletedAt + } + materializedType + } + } + } + } + definition { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + rawCode + runGeneratedAt + materializedType + } + } + } + } + } +} +``` + +
          + +## Quality + +You can use the Discovery API to monitor data source freshness and test results to diagnose and resolve issues and drive trust in data. When used with [webhooks](/docs/deploy/webhooks), it can also help with detecting, investigating, and alerting issues. Below are example questions the API can help you answer and queries you can run. + +For quality use cases, people typically query the historical or latest applied state, often in the upstream part of the DAG (for example, sources), using the `environment` or `environment { applied { modelHistoricalRuns } }` endpoints. + +### Which models and tests failed to run? + +By filtering on the latest status, you can get lists of models that failed to build and tests that failed during their most recent execution. This is helpful when diagnosing issues with the deployment that result in delayed or incorrect data. + +
          +Example query with code + +1. Get the latest run results across all jobs in the environment and return only the models and tests that errored/failed. + + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { lastRunStatus: error }) { + edges { + node { + name + executionInfo { + lastRunId + } + } + } + } + tests(first: $first, filter: { status: "fail" }) { + edges { + node { + name + executionInfo { + lastRunId + } + } + } + } + } + } +} +``` + +2. Review the historical execution and test failure rate (up to 20 runs) for a given model, such as a frequently used and important dataset. + + +```graphql +query ($environmentId: BigInt!, $uniqueId: String!, $lastRunCount: Int) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId, lastRunCount: $lastRunCount) { + name + executeStartedAt + status + tests { + name + status + } + } + } + } +} +``` + +3. Identify the runs and plot the historical trends of failure/error rates. + + +
          + + +### When was the data my model uses last refreshed? + +You can get the metadata on the latest execution for a particular model or across all models in your project. For instance, investigate when each model or snapshot that's feeding into a given model was last executed or the source or seed was last loaded to gauge the _freshness_ of the data. + +
          +Example query with code + + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { uniqueIds: "MODEL.PROJECT.MODEL_NAME" } + ) { + edges { + node { + name + ancestors(types: [Model, Source, Seed, Snapshot]) { + ... on ModelAppliedStateNestedNode { + name + resourceType + materializedType + executionInfo { + executeCompletedAt + } + } + ... on SourceAppliedStateNestedNode { + sourceName + name + resourceType + freshness { + maxLoadedAt + } + } + ... on SnapshotAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } + } + ... on SeedAppliedStateNestedNode { + name + resourceType + executionInfo { + executeCompletedAt + } + } + } + } + } + } + } + } +} +``` + + +```python +# Extract graph nodes from response +def extract_nodes(data): + models = [] + sources = [] + groups = [] + for model_edge in data["applied"]["models"]["edges"]: + models.append(model_edge["node"]) + for source_edge in data["applied"]["sources"]["edges"]: + sources.append(source_edge["node"]) + for group_edge in data["definition"]["groups"]["edges"]: + groups.append(group_edge["node"]) + models_df = pd.DataFrame(models) + sources_df = pd.DataFrame(sources) + groups_df = pd.DataFrame(groups) + + return models_df, sources_df, groups_df + +# Construct a lineage graph with freshness info +def create_freshness_graph(models_df, sources_df): + G = nx.DiGraph() + current_time = datetime.now(timezone.utc) + for _, model in models_df.iterrows(): + max_freshness = pd.Timedelta.min + if "meta" in models_df.columns: + freshness_sla = model["meta"]["freshness_sla"] + else: + freshness_sla = None + if model["executionInfo"]["executeCompletedAt"] is not None: + model_freshness = current_time - pd.Timestamp(model["executionInfo"]["executeCompletedAt"]) + for ancestor in model["ancestors"]: + if ancestor["resourceType"] == "SourceAppliedStateNestedNode": + ancestor_freshness = 
current_time - pd.Timestamp(ancestor["freshness"]['maxLoadedAt']) + elif ancestor["resourceType"] == "ModelAppliedStateNestedNode": + ancestor_freshness = current_time - pd.Timestamp(ancestor["executionInfo"]["executeCompletedAt"]) + + if ancestor_freshness > max_freshness: + max_freshness = ancestor_freshness + + G.add_node(model["uniqueId"], name=model["name"], type="model", max_ancestor_freshness = max_freshness, freshness = model_freshness, freshness_sla=freshness_sla) + for _, source in sources_df.iterrows(): + if source["maxLoadedAt"] is not None: + G.add_node(source["uniqueId"], name=source["name"], type="source", freshness=current_time - pd.Timestamp(source["maxLoadedAt"])) + for _, model in models_df.iterrows(): + for parent in model["parents"]: + G.add_edge(parent["uniqueId"], model["uniqueId"]) + + return G +``` + +Graph example: + + + +
          + + +### Are my data sources fresh? + +Checking [source freshness](/docs/build/sources#snapshotting-source-data-freshness) allows you to ensure that sources loaded and used in your dbt project are compliant with expectations. The API provides the latest metadata about source loading and information about the freshness check criteria. + + + +
          +Example query + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + sources( + first: $first + filter: { freshnessChecked: true, database: "production" } + ) { + edges { + node { + sourceName + name + identifier + loader + freshness { + freshnessJobDefinitionId + freshnessRunId + freshnessRunGeneratedAt + freshnessStatus + freshnessChecked + maxLoadedAt + maxLoadedAtTimeAgoInS + snapshottedAt + criteria { + errorAfter { + count + period + } + warnAfter { + count + period + } + } + } + } + } + } + } + } +} +``` + +
          + +### What’s the test coverage and status? + +[Tests](https://docs.getdbt.com/docs/build/tests) are an important way to ensure that your stakeholders are reviewing high-quality data. You can execute tests during a dbt Cloud run. The Discovery API provides complete test results for a given environment or job, which it represents as the `children` of a given node that’s been tested (for example, a `model`). + +
          +Example query + +For the following example, the `parents` are the nodes (code) that are being tested and `executionInfo` describes the latest test results: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + tests(first: $first) { + edges { + node { + name + columnName + parents { + name + resourceType + } + executionInfo { + lastRunStatus + lastRunError + executeCompletedAt + executionTime + } + } + } + } + } + } +} +``` + +
          + + + +### How is this model contracted and versioned? + +To enforce the shape of a model's definition, you can define contracts on models and their columns. You can also specify model versions to keep track of discrete stages in its evolution and use the appropriate one. + + + +
          +Example query + + +```graphql +query { + environment(id: 123) { + applied { + models(first: 100, filter: { access: public }) { + edges { + node { + name + latestVersion + contractEnforced + constraints { + name + type + expression + columns + } + catalog { + columns { + name + type + } + } + } + } + } + } + } +} +``` + +
          + +
          + +## Discovery + +You can use the Discovery API to find and understand relevant datasets and semantic nodes with rich context and metadata. Below are example questions and queries you can run. + +For discovery use cases, people typically query the latest applied or definition state, often in the downstream part of the DAG (for example, mart models or metrics), using the `environment` endpoint. + +### What does this dataset and its columns mean? + +Query the Discovery API to map a table/view in the data platform to the model in the dbt project; then, retrieve metadata about its meaning, including descriptive metadata and catalog information from its YAML file and the schema. + +
          +Example query + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models( + first: $first + filter: { + database: "analytics" + schema: "prod" + identifier: "customers" + } + ) { + edges { + node { + name + description + tags + meta + catalog { + columns { + name + description + type + } + } + } + } + } + } + } +} +``` +
          + + + + + +### Which metrics are available? + +You can define and query metrics using the [dbt Semantic Layer](/docs/build/about-metricflow), use them for documentation purposes (like for a data catalog), and calculate aggregations (like in a BI tool that doesn’t query the SL). To learn more, refer to [Get started with MetricFlow](/docs/build/sl-getting-started). + +
          +Example query + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + metrics(first: $first) { + edges { + node { + name + description + type + formula + filter + tags + parents { + name + resourceType + } + } + } + } + } + } +} +``` + +
          + +
          + + + +## Governance + +You can use the Discovery API to audit data development and facilitate collaboration within and between teams. + +For governance use cases, people tend to query the latest definition state, often in the downstream part of the DAG (for example, public models), using the `environment` endpoint. + +### Who is responsible for this model? + +You can define and surface the groups each model is associated with. Groups contain information like owner. This can help you identify which team owns certain models and who to contact about them. + +
          +Example query + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first, filter: { uniqueIds: ["MODEL.PROJECT.NAME"] }) { + edges { + node { + name + description + resourceType + access + group + } + } + } + } + definition { + groups(first: $first) { + edges { + node { + name + resourceType + models { + name + } + ownerName + ownerEmail + } + } + } + } + } +} +``` +
          + +### Who can use this model? + +You can enable people to specify the level of access for a given model. In the future, public models will function like APIs to unify project lineage and enable reuse of models using cross-project refs. + + +
          +Example query + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + models(first: $first) { + edges { + node { + name + access + } + } + } + } + } +} +``` + +--- + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + definition { + models(first: $first, filter: { access: public }) { + edges { + node { + name + } + } + } + } + } +} +``` +
          + +
          + +## Development + +You can use the Discovery API to understand dataset changes and usage and gauge impacts to inform project definition. Below are example questions and queries you can run. + +For development use cases, people typically query the historical or latest definition or applied state across any part of the DAG using the `environment` endpoint. + +### How is this model or metric used in downstream tools? +[Exposures](/docs/build/exposures) provide a method to define how a model or metric is actually used in dashboards and other analytics tools and use cases. You can query an exposure’s definition to see how project nodes are used and query its upstream lineage results to understand the state of the data used in it, which powers use cases like a freshness and quality status tile. + + + + +
          +Example query + +Below is an example that reviews an exposure and the models used in it including when they were last executed. + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + exposures(first: $first) { + edges { + node { + name + description + ownerName + url + parents { + name + resourceType + ... on ModelAppliedStateNestedNode { + executionInfo { + executeCompletedAt + lastRunStatus + } + } + } + } + } + } + } + } +} +``` +
          + +### How has this model changed over time? + +The Discovery API provides historical information about any resource in your project. For instance, you can view how a model has evolved over time (across recent runs) given changes to its shape and contents. + +
          +Example query + +Review the differences in `compiledCode` or `columns` between runs or plot the “Approximate Size” and “Row Count” `stats` over time: + +```graphql +query ( + $environmentId: BigInt! + $uniqueId: String! + $lastRunCount: Int! + $withCatalog: Boolean! +) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns( + uniqueId: $uniqueId + lastRunCount: $lastRunCount + withCatalog: $withCatalog + ) { + name + compiledCode + columns { + name + } + stats { + label + value + } + } + } + } +} +``` +
          + +### Which nodes depend on this data source? + +dbt lineage begins with data sources. For a given source, you can look at which nodes are its children, then iterate downstream to get the full list of dependencies. + +Currently, querying beyond 1 generation (defined as a direct parent-to-child relationship) is not supported. To see the grandchildren of a node, you need to make two queries: one to get the node and its children, and another to get the children nodes and their children. + +
          +Example query + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + sources( + first: $first + filter: { uniqueIds: ["SOURCE_NAME.TABLE_NAME"] } + ) { + edges { + node { + loader + children { + uniqueId + resourceType + ... on ModelAppliedStateNestedNode { + database + schema + alias + } + } + } + } + } + } + } +} +``` +
          + +## Related docs + +- [Query Discovery API](/docs/dbt-cloud-apis/discovery-querying) diff --git a/website/docs/docs/dbt-cloud-apis/metadata-api.md b/website/docs/docs/dbt-cloud-apis/metadata-api.md deleted file mode 100644 index 169afbb225f..00000000000 --- a/website/docs/docs/dbt-cloud-apis/metadata-api.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "Metadata API" -id: "metadata-api" ---- - -## About the metadata API - -Every time that dbt Cloud runs a dbt project, it generates metadata which pertains to the accuracy, recency, configuration, and structure of the views and tables in the warehouse. dbt Cloud serves a GraphQL API which supports arbitrary queries over this metadata; the endpoint for this API is `https://metadata.cloud.getdbt.com/graphql`. This API is an incredibly rich resource for evaluating data health longitudinally or at a point in time. - -## Prerequisites - -The metadata API is available to accounts on the _Team_ and _Enterprise_ plans, for any version >= dbt v0.19.0. Please note that artifacts generated with a version of dbt _less than_ v0.19.0 will not be accessible via the dbt Cloud metadata API. For information on upgrading, see "[Version migration guides](https://docs.getdbt.com/guides/migration/versions)." - -## How to browse the API - -We also provide a [graphical explorer](https://metadata.cloud.getdbt.com/graphiql) for this API where you can run ad-hoc queries or browse the schema. As GraphQL provides a self-describing API, the schema shown in the GraphiQL interface is an accurate representation of the graph and fields available to query on at any point in time. - -## Retention limits for data - -You can use the metadata API to query data from the previous 3 months. For example, if today was April 1, you could query data back to January 1st. 
- -*We are continuously expanding the capabilities of the metadata API and we welcome your feedback and suggestions at metadata@dbtlabs.com.* diff --git a/website/docs/docs/dbt-cloud-apis/metadata-querying.md b/website/docs/docs/dbt-cloud-apis/metadata-querying.md deleted file mode 100644 index 297d76e58e1..00000000000 --- a/website/docs/docs/dbt-cloud-apis/metadata-querying.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -title: "Query the Metadata API" -id: "metadata-querying" ---- -Accounts on the _Team_ and _Multi-Tenant Enterprise_ plans can query the dbt Metadata API. - -## Authorization - -Currently, authorization of requests takes place [using a service token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). dbt Cloud admin users can generate a Metadata Only service token that is authorized to execute a specific query against the Metadata API. - -Once you've created a token, you can use it in the Authorization header of requests to the dbt Cloud Metadata API. Be sure to include the Token prefix in the Authorization header, or the request will fail with a `401 Unauthorized` error. Note that `Bearer` can be used in place of `Token` in the Authorization header. Both syntaxes are equivalent. - -## Running Queries - -You can run queries by sending a `POST` request to the https://metadata.cloud.getdbt.com/graphql endpoint. Be sure to replace your token in the Authorization header with your actual API token. - -``` -curl 'https://metadata.cloud.getdbt.com/graphql' \ - -H 'authorization: Bearer ' \ - -H ‘content-type: application/json’ - -X POST - --data -``` - -The `` body should be a JSON string in the format: - -``` -{ “query”: “” } -``` - -Every query will rely on a *jobID*. You can get the jobID by clicking into the relevant job in dbt Cloud and observing the URL. 
In this example URL, the jobID would be 917: `https://cloud.getdbt.com/#/accounts/1/projects/665/jobs/917/` - -There are several illustrative example queries in this documentation (examples of queries on the Model node, [here](/docs/dbt-cloud-apis/metadata-schema-model). - -## GraphiQL -You can also experiment and run queries directly in the [GraphiQL interface](https://metadata.cloud.getdbt.com/graphiql), which is convenient for exploration. On the right hand side, there is a document explorer where you can see all possible nodes and fields. Below is an example of what a query looks like in GraphiQL. Note that you must authenticate via bearer auth with your token. - - diff --git a/website/docs/docs/dbt-cloud-apis/migrating-to-v2.md b/website/docs/docs/dbt-cloud-apis/migrating-to-v2.md new file mode 100644 index 00000000000..3e6ac2c3577 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/migrating-to-v2.md @@ -0,0 +1,32 @@ +--- +title: "Migrating to dbt Cloud Administrative API v2" +description: "You should migrate to API v2 while we deprecate API v4 " +sidebar_label: "Migrating to API v2" +id: "migrating-to-v2" +--- + + +In an attempt to provide an improved dbt Cloud Administrative API experience, the dbt Cloud API v4 will be deprecated by April 30th, 2023. We suggest you migrate to dbt Cloud Administrative API v2. When migrating from API v4 to API v2, there are a few differences you should consider when querying your dbt Cloud account. + +## Key differences + +When using the [List runs](/dbt-cloud/api-v2-legacy#tag/Runs) endpoint, you can include triggered runs and sort by ID. 
You can use the following request in v2 to get a similar response as v4, replacing the `{accountId}` with your own and `{YOUR_ACCESS_URL}` with the appropriate [Access URL](https://docs.getdbt.com/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan: + +```shell +GET https://{YOUR_ACCESS_URL}/api/v2/accounts/{accountId}/runs/?include_related=[%22trigger%22]&order_by=-id +``` +For example, if your region is EMEA multi-tenant and your account ID is `001`, your endpoint would be: + +```shell +GET https://emea.dbt.com/api/v2/accounts/001/runs/?include_related=[%22trigger%22]&order_by=-id +``` + +Differences in responses include: + +| Property description | API v4 | API v2 | +|---------------------|-----------|-------------| +| Reverse sort order when you use sort by `-id` | Defaults to order by most recent | Defaults to order by least recent | +| Update to timestamps | Unix timestamps | ISO strings | +| Update to IDs: `id`, `environment_id`, `account_id`, `project_id`, `job_id` | Values are the same, but they are strings | Values are the same, but they are numeric | +| New property for returning runs with the specified status | `status` property | Maps to `status_humanized` | +| New property for including related field with run | `replace` property | Maps to the `trigger` property | diff --git a/website/docs/docs/dbt-cloud-apis/project-state.md b/website/docs/docs/dbt-cloud-apis/project-state.md new file mode 100644 index 00000000000..a5ee71ebb1b --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/project-state.md @@ -0,0 +1,85 @@ +--- +title: "Project state in dbt Cloud" +--- + +dbt Cloud provides a stateful way of deploying dbt. Artifacts are accessible programmatically via the [Discovery API](/docs/dbt-cloud-apis/discovery-querying) in the metadata platform. + +With the implementation of the `environment` endpoint in the Discovery API, we've introduced the idea of multiple states. 
The Discovery API provides a single API endpoint that returns the latest state of models, sources, and other nodes in the DAG. + +A single [deployment environment](/docs/environments-in-dbt) should represent the production state of a given dbt Cloud project. + +There are two states that can be queried in dbt Cloud: + +- **Applied state** refers to what exists in the data warehouse after a successful `dbt run`. The model build succeeds and now exists as a table in the warehouse. + +- **Definition state** depends on what exists in the project given the code defined in it (for example, manifest state), which hasn’t necessarily been executed in the data platform (maybe just the result of `dbt compile`). + +### Definition (logical) vs. applied state of dbt nodes + +In a dbt project, the state of a node _definition_ represents the configuration, transformations, and dependencies defined in the SQL and YAML files. It captures how the node should be processed in relation to other nodes and tables in the data warehouse and may be produced by a `dbt build`, `run`, `parse`, or `compile`. It changes whenever the project code changes. + +A node’s _applied state_ refers to the node’s actual state after it has been successfully executed in the DAG; for example, models are executed; thus, their state is applied to the data warehouse via `dbt run` or `dbt build`. It changes whenever a node is executed. This state represents the result of the transformations and the actual data stored in the database, which for models can be a table or a view based on the defined logic. + +The applied state includes execution info, which contains metadata about how the node arrived in the applied state: the most recent execution (successful or attempted), such as when it began, its status, and how long it took. + +Here’s how you’d query and compare the definition vs. applied state of a model using the Discovery API: + +```graphql +query Compare($environmentId: Int!, $first: Int!) 
{ + environment(id: $environmentId) { + definition { + models(first: $first) { + edges { + node { + name + rawCode + } + } + } + } + applied { + models(first: $first) { + edges { + node { + name + rawCode + executionInfo { + executeCompletedAt + } + } + } + } + } + } +} + +``` + +Most Discovery API use cases will favor the _applied state_ since it pertains to what has actually been run and can be analyzed. + +### Affected states by node type + +| Node | Executed in DAG | Created by execution | Exists in database | Lineage | States | +|-----------|------------------|----------------------|--------------------|-----------------------|----------------------| +| Model | Yes | Yes | Yes | Upstream & downstream | Applied & definition | +| Source | Yes | No | Yes | Downstream | Applied & definition | +| Seed | Yes | Yes | Yes | Downstream | Applied & definition | +| Snapshot | Yes | Yes | Yes | Upstream & downstream | Applied & definition | +| Test | Yes | Yes | No | Upstream | Applied & definition | +| Exposure | No | No | No | Upstream | Applied & definition | +| Metric | No | No | No | Upstream & downstream | Definition | +| Semantic model | No | No | No | Upstream & downstream | Definition | +| Group | No | No | No | Downstream | Definition | +| Macro | Yes | No | No | N/A | Definition | + + ### Caveats about state/metadata updates + +Over time, Cloud Artifacts will provide information to maintain state for features/services in dbt Cloud and enable you to access state in dbt Cloud and its downstream ecosystem. Cloud Artifacts is currently focused on the latest production state, but this focus will evolve. + +Here are some limitations of the state representation in the Discovery API: + +- Users must access the default production environment to know the latest state of a project. +- The API gets the definition from the latest manifest generated in a given deployment environment, but that often won’t reflect the latest project code state. 
+- Compiled code results may be outdated depending on dbt Cloud run step order and failures. +- Catalog info can be outdated, or incomplete (in the applied state), based on if/when docs were last generated. +- Source freshness checks can be out of date (in the applied state) depending on when the command was last run, and it’s not included in `build`. diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx new file mode 100644 index 00000000000..d1463f9e9b7 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment-applied-modelHistoricalRuns.mdx @@ -0,0 +1,50 @@ +--- +title: "Model Historical Runs object schema" +sidebar_label: "Model historical runs" +id: "discovery-schema-environment-applied-modelHistoricalRuns" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The model historical runs object allows you to query information about a model's run history. + +The [Example query](#example-query) illustrates a few fields you can query with the `modelHistoricalRuns` object. Refer to [Fields](#fields) to view the entire schema, which provides all possible fields you can query. + +### Arguments + +When querying for `modelHistoricalRuns`, you can use the following arguments: + + + +### Example query + +You can use the `environmentId` and the model's `uniqueId` to return the model and its execution time for the last 20 times it was run, regardless of which job ran it. 
+ +```graphql +query { + environment(id: 834) { + applied { + modelHistoricalRuns( + uniqueId: "model.marketing.customers" + lastRunCount: 20 + ) { + runId # Get historical results for a particular model + runGeneratedAt + executionTime # View build time across runs + status + tests { + name + status + executeCompletedAt + } # View test results across runs + } + } + } +} +``` + +### Fields + +When querying for `modelHistoricalRuns`, you can use the following fields: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx new file mode 100644 index 00000000000..a82bba6576d --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-environment.mdx @@ -0,0 +1,104 @@ +--- +title: "Environment object schema" +sidebar_label: "Environment" +id: "discovery-schema-environment" +--- + +import { QueryArgsTable, SchemaTable } from "./schema"; + + + +The environment object allows you to query information about a particular model based on `environmentId`. + +The [Example queries](#example-queries) illustrate a few fields you can query with this `environment` object. Refer to [Fields](#fields) to view the entire schema, which provides all possible fields you can query. + +### Arguments + +When querying for `environment`, you can use the following arguments. + + + +:::caution + +dbt Labs is making changes to the Discovery API. These changes will take effect on August 15, 2023. + +The data type `Int` for `id` is being deprecated and will be replaced with `BigInt`. When the time comes, you will need to update your API call accordingly to avoid errors. 
+::: + +### Example queries + +You can use your production environment's `id`: + +```graphql +query Example { + environment(id: 834){ # Get the latest state of the production environment + applied { # The state of an executed node as it exists as an object in the database + models(first: 100){ # Pagination to ensure manageable response for large projects + edges { node { + uniqueId, name, description, rawCode, compiledCode, # Basic properties + database, schema, alias, # Table/view identifier (can also filter by) + executionInfo {executeCompletedAt, executionTime}, # Metadata from when the model was built + tests {name, executionInfo{lastRunStatus, lastRunError}}, # Latest test results + catalog {columns {name, description, type}, stats {label, value}}, # Catalog info + ancestors(types:[Source]) {name, ...on SourceAppliedStateNode {freshness{maxLoadedAt, freshnessStatus}}}, # Source freshness + children {name, resourceType}}} # Immediate dependencies in lineage + totalCount } # Number of models in the project + } + definition { # The logical state of a given project node given its most recent manifest generated + models(first: 100, filter:{access:public}){ # Filter on model access (or other properties) + edges { node { + rawCode, # Compare to see if/how the model has changed since the last build + jobDefinitionId, runGeneratedAt, # When the code was last compiled or run + contractEnforced, group, version}}} # Model governance + } + }} +``` + +With the deprecation of the data type `Int` for `id`, below is an example of replacing it with `BigInt`: + +```graphql +query ($environmentId: BigInt!, $first: Int!) 
{ + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} + +``` + +With the deprecation of `modelByEnvironment`, below is an example of replacing it with `environment`: + +```graphql +query ($environmentId: BigInt!, $uniqueId: String) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } + } + } +} +``` + +### Fields +When querying an `environment`, you can use the following fields. + + + +When querying the `applied` field of `environment`, you can use the following fields. + + +When querying the `definition` field of `environment`, you can use the following fields. + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx new file mode 100644 index 00000000000..58855659d05 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposure.mdx @@ -0,0 +1,64 @@ +--- +title: "Exposure object schema" +sidebar_label: "Exposure" +id: "discovery-schema-job-exposure" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The exposure object allows you to query information about a particular exposure. To learn more, refer to [Add Exposures to your DAG](/docs/build/exposures). + +### Arguments + +When querying for an `exposure`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the exposure object. + +### Example query + +The example below queries information about an exposure including the owner's name and email, the URL, and information about parent sources and parent models. 
+ +```graphql +{ + job(id: 123) { + exposure(name: "my_awesome_exposure") { + runId + projectId + name + uniqueId + resourceType + ownerName + url + ownerEmail + parentsSources { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + parentsModels { + uniqueId + } + } + } +} +``` + +### Fields +When querying for an `exposure`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx new file mode 100644 index 00000000000..b4fe027e324 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-exposures.mdx @@ -0,0 +1,65 @@ +--- +title: "Exposures object schema" +sidebar_label: "Exposures" +id: "discovery-schema-job-exposures" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The exposures object allows you to query information about all exposures in a given job. To learn more, refer to [Add Exposures to your DAG](/docs/build/exposures). + + +### Arguments + +When querying for `exposures`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the exposures object. + +### Example query + +The example below queries information about all exposures in a given job including the owner's name and email, the URL, and information about parent sources and parent models for each exposure. 
+ +```graphql +{ + job(id: 123) { + exposures(jobId: 123) { + runId + projectId + name + uniqueId + resourceType + ownerName + url + ownerEmail + parentsSources { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + parentsModels { + uniqueId + } + } + } +} +``` + +### Fields +When querying for `exposures`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx new file mode 100644 index 00000000000..3a8a52a19cb --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metric.mdx @@ -0,0 +1,58 @@ +--- +title: "Metric object schema" +sidebar_label: "Metric" +id: "discovery-schema-job-metric" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The metric object allows you to query information about [metrics](/docs/build/metrics). + +### Arguments + +When querying for a `metric`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the metric object. + +### Example query + +The example query below outputs information about a metric. You can also add any field from the model endpoint (the example simply selects name). This includes schema, database, uniqueId, columns, and more. For details, refer to [Model object schema](/docs/dbt-cloud-apis/discovery-schema-job-model). 
+ + +```graphql +{ + job(id: 123) { + metric(uniqueId: "metric.jaffle_shop.new_customers") { + uniqueId + name + packageName + tags + label + runId + description + type + sql + timestamp + timeGrains + dimensions + meta + resourceType + filters { + field + operator + value + } + model { + name + } + } + } +} +``` + +### Fields +When querying for a `metric`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx new file mode 100644 index 00000000000..174dd5b676a --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-metrics.mdx @@ -0,0 +1,60 @@ +--- +title: "Metrics object schema" +sidebar_label: "Metrics" +id: "discovery-schema-job-metrics" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The metrics object allows you to query information about [metrics](/docs/build/metrics). + + +### Arguments + +When querying for `metrics`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the metrics object. + +### Example query + +The example query returns information about all metrics for the given job. + +```graphql +{ + job(id: 123) { + metrics { + uniqueId + name + packageName + tags + label + runId + description + type + sql + timestamp + timeGrains + dimensions + meta + resourceType + filters { + field + operator + value + } + model { + name + } + } + } +} +``` + +### Fields +The metrics object can access the _same fields_ as the [metric node](/docs/dbt-cloud-apis/discovery-schema-job-metric). The difference is that the metrics object can output a list so instead of querying for fields for one specific metric, you can query for those parameters for all metrics in a run. 
+ +When querying for `metrics`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx new file mode 100644 index 00000000000..abd1ca1b1d6 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-model.mdx @@ -0,0 +1,91 @@ +--- +title: "Model object schema" +sidebar_label: "Model" +id: "discovery-schema-job-model" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The model object allows you to query information about a particular model in a given job. + +### Arguments + +When querying for a `model`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the model object. + +### Example query for finding parent models and sources + +The example query below uses the `parentsModels` and `parentsSources` fields to fetch information about a model’s parent models and parent sources. The jobID and uniqueID fields are placeholders that you will need to replace with your own values. + +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + parentsModels { + runId + uniqueId + executionTime + } + parentsSources { + runId + uniqueId + state + } + } + } +} + +``` + +### Example query for model timing + +The example query below could be useful if you want to understand information around execution timing on a given model (start, end, completion). + +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + runId + projectId + name + uniqueId + resourceType + executeStartedAt + executeCompletedAt + executionTime + } + } +} +``` + +### Example query for column-level information + +You can use the following example query to understand more about the columns of a given model. 
This query will only work if the job has generated documentation; that is, it will work with the command `dbt docs generate`. + +```graphql +{ + job(id: 123) { + model(uniqueId: "model.jaffle_shop.dim_user") { + columns { + name + index + type + comment + description + tags + meta + } + } + } +} +``` + + +### Fields + +When querying for a `model`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx new file mode 100644 index 00000000000..ee512f3cd97 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-models.mdx @@ -0,0 +1,59 @@ +--- +title: "Models object schema" +sidebar_label: "Models" +id: "discovery-schema-job-models" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + + +The models object allows you to query information about all models in a given job. + +### Arguments + +When querying for `models`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the models object. + +### Example queries +The database, schema, and identifier arguments are all optional. This means that with this endpoint you can: + +- Find a specific model by providing `<database>.<schema>.<identifier>` +- Find all of the models in a database and/or schema by providing `<database>` and/or `<schema>` + +#### Find models by their database, schema, and identifier +The example query below finds a model by its unique database, schema, and identifier. + +```graphql +{ + job(id: 123) { + models(database:"analytics", schema: "analytics", identifier:"dim_customers") { + uniqueId + } + } +} +``` + +#### Find models by their schema +The example query below finds all models in this schema and their respective execution times.
+ +```graphql +{ + job(id: 123) { + models(schema: "analytics") { + uniqueId + executionTime + } + } +} +``` + + +### Fields +The models object can access the _same fields_ as the [Model node](/docs/dbt-cloud-apis/discovery-schema-job-model). The difference is that the models object can output a list so instead of querying for fields for one specific model, you can query for those parameters for all models within a jobID, database, and so on. + +When querying for `models`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx new file mode 100644 index 00000000000..924e3e87e91 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seed.mdx @@ -0,0 +1,42 @@ +--- +title: "Seed object schema" +sidebar_label: "Seed" +id: "discovery-schema-job-seed" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The seed object allows you to query information about a particular seed in a given job. + +### Arguments + +When querying for a `seed`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the seed object. + +### Example query + +The example query below pulls relevant information about a given seed. For instance, you can view the load time. 
+ +```graphql +{ + job(id: 123) { + seed(uniqueId: "seed.jaffle_shop.raw_customers") { + database + schema + uniqueId + name + status + error + } + } +} +``` + +### Fields + +When querying for a `seed`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx new file mode 100644 index 00000000000..6ed45216e5f --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-seeds.mdx @@ -0,0 +1,40 @@ +--- +title: "Seeds object schema" +sidebar_label: "Seeds" +id: "discovery-schema-job-seeds" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The seeds object allows you to query information about all seeds in a given job. + +### Arguments + +When querying for `seeds`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the seeds object. + +### Example query + +The example query below pulls relevant information about all seeds in a given job. For instance, you can view load times. + +```graphql +{ + job(id: 123) { + seeds { + uniqueId + name + executionTime + status + } + } +} +``` + +### Fields + +When querying for `seeds`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx new file mode 100644 index 00000000000..a57163e0554 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-snapshots.mdx @@ -0,0 +1,49 @@ +--- +title: "Snapshots object schema" +sidebar_label: "Snapshots" +id: "discovery-schema-job-snapshots" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The snapshots object allows you to query information about all snapshots in a given job. + +### Arguments + +When querying for `snapshots`, the following arguments are available. 
+ + +Below we show some illustrative example queries and outline the schema of the snapshots object. + +### Example query + +The database, schema, and identifier arguments are optional. This means that with this endpoint you can: + +- Find a specific snapshot by providing `<database>.<schema>.<identifier>` +- Find all of the snapshots in a database and/or schema by providing `<database>` and/or `<schema>` + +#### Find snapshots information for a job + +The example query returns information about all snapshots in this job. + +```graphql +{ + job(id: 123) { + snapshots { + uniqueId + name + executionTime + environmentId + executeStartedAt + executeCompletedAt + } + } +} +``` + +### Fields + +When querying for `snapshots`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx new file mode 100644 index 00000000000..972e929f4cd --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-source.mdx @@ -0,0 +1,52 @@ +--- +title: "Source object schema" +sidebar_label: "Source" +id: "discovery-schema-job-source" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The source object allows you to query information about a particular source in a given job. + +### Arguments + +When querying for a `source`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the source object. + +### Example query + +The query below pulls relevant information about a given source. For instance, you can view the load time and the state (pass, fail, error) of that source.
+ +```graphql +{ + job(id: 123) { + source(uniqueId: "source.jaffle_shop.snowplow.event") { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + } +} +``` + +### Fields + +When querying for a `source`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx new file mode 100644 index 00000000000..97f717d269a --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-sources.mdx @@ -0,0 +1,65 @@ +--- +title: "Sources object schema" +sidebar_label: "Sources" +id: "discovery-schema-job-sources" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The sources object allows you to query information about all sources in a given job. + +### Arguments + +When querying for `sources`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema of the sources object. + +### Example queries + +The database, schema, and identifier arguments are optional. This means that with this endpoint you can: + +- Find a specific source by providing `<database>.<schema>.<identifier>` +- Find all of the sources in a database and/or schema by providing `<database>` and/or `<schema>` + +#### Finding sources by their database, schema, and identifier + +The example query below finds a source by its unique database, schema, and identifier. + +```graphql +{ + job(id: 123) { + sources( + database: "analytics" + schema: "analytics" + identifier: "dim_customers" + ) { + uniqueId + } + } +} +``` + +#### Finding sources by their schema + +The example query below finds all sources in this schema and their respective states (pass, error, fail).
+ +```graphql +{ + job(id: 123) { + sources(schema: "analytics") { + uniqueId + state + } + } +} +``` + +### Fields + +The sources object can access the _same fields_ as the [source node](/docs/dbt-cloud-apis/discovery-schema-job-source). The difference is that the sources object can output a list so instead of querying for fields for one specific source, you can query for those parameters for all sources within a jobID, database, and so on. + +When querying for `sources`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx new file mode 100644 index 00000000000..c52aa49ab93 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-test.mdx @@ -0,0 +1,43 @@ +--- +title: "Test object schema" +sidebar_label: "Test" +id: "discovery-schema-job-test" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The test object allows you to query information about a particular test. + +### Arguments + +When querying for a `test`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the test object. + +### Example query + +The example query below outputs information about a test including the state of the test result. In order of severity, the result can be one of these: "error", "fail", "warn", or "pass". 
+ +```graphql +{ + job(id: 123) { + test(uniqueId: "test.internal_analytics.not_null_metrics_id") { + runId + accountId + projectId + uniqueId + name + columnName + state + } + } +} +``` + +### Fields + +When querying for a `test`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx new file mode 100644 index 00000000000..efcef674c55 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job-tests.mdx @@ -0,0 +1,43 @@ +--- +title: "Tests object schema" +sidebar_label: "Tests" +id: "discovery-schema-job-tests" +--- + +import { NodeArgsTable, SchemaTable } from "./schema"; + +The tests object allows you to query information about all tests in a given job. + +### Arguments + +When querying for `tests`, the following arguments are available. + + + +Below we show some illustrative example queries and outline the schema (all possible fields you can query) of the tests object. + +### Example query + +The example query below finds all tests in this job and includes information about those tests. + +```graphql +{ + job(id: 123) { + tests { + runId + accountId + projectId + uniqueId + name + columnName + state + } + } +} +``` + +### Fields + +When querying for `tests`, the following fields are available: + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx b/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx new file mode 100644 index 00000000000..8b02c5601ad --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/schema-discovery-job.mdx @@ -0,0 +1,64 @@ +--- +title: "Job object schema" +sidebar_label: "Job" +id: "discovery-schema-job" +pagination_next: "docs/dbt-cloud-apis/discovery-schema-job-model" +pagination_prev: null +--- + +import { QueryArgsTable, SchemaTable } from "./schema"; + +The job object allows you to query information about a particular model based on `jobId` and, optionally, a `runId`. 
+ +If you don't provide a `runId`, the API returns information on the latest runId of a job. + +The [example query](#example-query) illustrates a few fields you can query in this `job` object. Refer to [Fields](#fields) to see the entire schema, which provides all possible fields you can query. + +### Arguments + +When querying for `job`, you can use the following arguments. + + + + +### Example Query + +You can use your production job's `id`. + +```graphql +query JobQueryExample { + # Provide runId for looking at specific run, otherwise it defaults to latest run + job(id: 940) { + # Get all models from this job's latest run + models(schema: "analytics") { + uniqueId + executionTime + } + + # Or query a single node + source(uniqueId: "source.jaffle_shop.snowplow.event") { + uniqueId + sourceName + name + state + maxLoadedAt + criteria { + warnAfter { + period + count + } + errorAfter { + period + count + } + } + maxLoadedAtTimeAgoInS + } + } +} +``` + +### Fields +When querying a `job`, you can use the following fields. + + diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-exposure.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-exposure.mdx deleted file mode 100644 index 5297cc7da53..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-exposure.mdx +++ /dev/null @@ -1,62 +0,0 @@ ---- -title: "Exposure" -id: "metadata-schema-exposure" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The exposure object allows you to query information about a particular exposure. You can learn more about exposures [here](/docs/build/exposures). - -### Arguments - -When querying for an `exposure`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this exposure object.
- -### Example Queries -#### Exposure information - -The example query below queries information about an exposure, including the owner's name and email, the url, and information about parent sources and parent models. - -```graphql -{ - exposure(jobId: 123, name: "my_awesome_exposure") { - runId - projectId - name - uniqueId - resourceType - ownerName - url - ownerEmail - parentsSources { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } - parentsModels { - uniqueId - } - } -} -``` - -### Fields -When querying for an `exposure`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-exposures.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-exposures.mdx deleted file mode 100644 index 54ec4bbc905..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-exposures.mdx +++ /dev/null @@ -1,62 +0,0 @@ ---- -title: "Exposures" -id: "metadata-schema-exposures" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The exposures object allows you to query information about all exposures in a given job. You can learn more about exposures [here](/docs/build/exposures). - -### Arguments - -When querying for `exposures`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this exposures object. - -### Example Queries -#### Exposures information - -The example query below queries information about all exposures in a given job, including, for each exposure, the owner's name and email, the url, and information about parent sources and parent models. 
- -```graphql -{ - exposures(jobId: 123) { - runId - projectId - name - uniqueId - resourceType - ownerName - url - ownerEmail - parentsSources { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } - parentsModels { - uniqueId - } - } -} -``` - -### Fields -When querying for `exposures`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-metric.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-metric.mdx deleted file mode 100644 index 46a1e79abba..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-metric.mdx +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: "Metric" -id: "metadata-schema-metric" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The metric object allows you to query information about [metrics](https://docs.getdbt.com/docs/building-a-dbt-project/metrics). - -### Arguments - -When querying for a `metric`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this metric object. - -### Example Queries -#### Metric information - -The example query below outputs information about a metric. Note that you can also add any field from the Model endpoint -- here we are simply selecting name. This includes schema, database, uniqueId, columns and more -- find documentation [here](/docs/dbt-cloud-apis/metadata-schema-model). 
- - -```graphql -{ - metric(jobId: 123, uniqueId: "metric.jaffle_shop.new_customers") { - uniqueId - name - packageName - tags - label - runId - description - type - sql - timestamp - timeGrains - dimensions - meta - resourceType - filters { - field - operator - value - } - model { - name - } - } -} - -``` - -### Fields -When querying for a `metric`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-metrics.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-metrics.mdx deleted file mode 100644 index b9077e228bd..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-metrics.mdx +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: "Metrics" -id: "metadata-schema-metrics" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The metrics object allows you to query information about [metrics](https://docs.getdbt.com/docs/building-a-dbt-project/metrics). - -### Arguments - -When querying for `metrics`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this metrics object. - -### Example Queries -#### Metrics information - -The example query returns information about all metrics in this job. - -```graphql -{ - metrics(jobId: 123) { - uniqueId - name - packageName - tags - label - runId - description - type - sql - timestamp - timeGrains - dimensions - meta - resourceType - filters { - field - operator - value - } - model { - name - } - } -} - -``` - -### Fields -metrics has access to the *same fields* as the [metric node](/docs/dbt-cloud-apis/metadata-schema-metric). The difference is that metrics can output a list, so instead of querying for fields for one specific metric, you can query for those parameters for all metrics in a run. 
- -When querying for `metrics`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-model.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-model.mdx deleted file mode 100644 index 9643cc84399..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-model.mdx +++ /dev/null @@ -1,83 +0,0 @@ ---- -title: "Model" -id: "metadata-schema-model" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The model object allows you to query information about a particular model in a given job. - -### Arguments - -When querying for a `model`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this model object. - -### Example Queries -#### Finding parent models and sources - -The example query below uses the `parentsModels` and `parentsSources` fields to fetch information about a model’s parent models and parent sources. Note that we put a placeholder jobID and uniqueID, which you will have to replace. - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - parentsModels { - runId - uniqueId - executionTime - } - parentsSources { - runId - uniqueId - state - } - } -} -``` - -#### Model Timing - -The example query below could be useful if we wanted to understand information around execution timing on a given model (start, end, completion). - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - runId - projectId - name - uniqueId - resourceType - executeStartedAt - executeCompletedAt - executionTime - } -} -``` - -#### Column-level information - -You can use the following example query to understand more about the columns of a given model. Note that this will only work if the job has generated documentation. 
For example it will work with the command `dbt docs generate`. - -```graphql -{ - model(jobId: 123, uniqueId: "model.jaffle_shop.dim_user") { - columns{ - name - index - type - comment - description - tags - meta - } - } -} -``` - - -### Fields -When querying for a `model`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-modelByEnv.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-modelByEnv.mdx deleted file mode 100644 index 657987039f0..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-modelByEnv.mdx +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: "Model By Environment" -id: "metadata-schema-modelByEnv" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - - -This model by environment object allows you to query information about a particular model based on `environmentId`. - -The [example query](#example-query) illustrates a few fields you can query in this `modelByEnvironment` object. Refer to [Fields](#fields) to see the entire schema, which provides all possible fields you can query. - -### Arguments - -When querying for `modelByEnvironment`, you can use the following arguments. - - - - -### Example Query - -You can use the `environment_id` and `model_unique_id` to return the model and its execution time for the last 10 times it was run, regardless of which job ran it! - -```graphql -{ - modelByEnvironment( - environmentId: 1, - uniqueId:"model.jaffle_shop.dim_user", - lastRunCount:10, - withCatalog: false - ){ - uniqueId - jobId, - runId, - executionTime - } -} -``` - -### Fields -When querying for `modelByEnvironment`, you can use the following fields. 
- - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-models.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-models.mdx deleted file mode 100644 index 00bef5e1197..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-models.mdx +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: "Models" -id: "metadata-schema-models" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - -The models object allows you to query information about all models in a given job. - -### Arguments - -When querying for `models`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this models object. - -### Example Queries -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific model by providing `..` -- Find all of the models in a database and/or schema by providing `` and/or `` - -#### Finding models by their database, schema, and identifier -The example query below finds a model by its unique database, schema, and identifier. - -```graphql -{ - models(jobId: 123, database:"analytics", schema: "analytics", identifier:"dim_customers") { - uniqueId - } -} -``` - -#### Finding models by their schema -The example query below finds all models in this schema, and their respective execution times. - -```graphql -{ - models(jobId: 123, schema: "analytics") { - uniqueId - executionTime - } -} -``` - - -### Fields -Models has access to the *same fields* as the [Model node](/docs/dbt-cloud-apis/metadata-schema-model). The difference is that Models can output a list, so instead of querying for fields for one specific model, you can query for those parameters for all models within a jobID, database, etc. 
- -When querying for `models`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-seed.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-seed.mdx deleted file mode 100644 index c612ca45900..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-seed.mdx +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: "Seed" -id: "metadata-schema-seed" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The seed object allows you to query information about a particular seed in a given job. - -### Arguments - -When querying for a `seed`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this seed object. - -### Example Queries - -#### Seed information - -The query below pulls relevant information about a given seed. For example, we could see the load time. - -```graphql -{ - seed(jobId: 123, uniqueId: "seed.jaffle_shop.raw_customers") { - database - schema - uniqueId - name - status - error - } -} -``` - -### Fields - -When querying for a `seed`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-seeds.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-seeds.mdx deleted file mode 100644 index 38ebf34b6dd..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-seeds.mdx +++ /dev/null @@ -1,38 +0,0 @@ ---- -title: "Seeds" -id: "metadata-schema-seeds" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The seeds object allows you to query information about a all seeds in a given job. - -### Arguments - -When querying for `seeds`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this seeds object. 
- -### Example Queries -#### Seeds information - -The query below pulls relevant information about all seeds in a given job. For example, we could see the load times. - -```graphql -{ - seeds(jobId: 123) { - uniqueId - name - executionTime - status - } -} -``` - -### Fields - -When querying for `seeds`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-snapshots.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-snapshots.mdx deleted file mode 100644 index 19b94c1462b..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-snapshots.mdx +++ /dev/null @@ -1,45 +0,0 @@ ---- -title: "Snapshots" -id: "metadata-schema-snapshots" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The snapshots object allows you to query information about all snapshots in a given job. - -### Arguments - -When querying for `snapshots`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this snapshots object. - -### Example Query -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific snapshot by providing `..` -- Find all of the snapshots in a database and/or schema by providing `` and/or `` - -#### Finding snapshots information for a job -The example query returns information about all snapshots in this job. - -```graphql -{ - snapshots(jobId: 123) { - uniqueId - name - executionTime - environmentId - executeStartedAt - executeCompletedAt - } -} - -``` - -### Fields -Snapshots has access to the *same fields* as the [Snapshot node](/docs/dbt-cloud-apis/metadata-schema-snapshots). 
The difference is that Snapshots can output a list, so instead of querying for fields for one specific snapshot, you can query for those parameters for all snapshots within a jobID, database, etc. - -When querying for `snapshots`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-source.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-source.mdx deleted file mode 100644 index e3a15564ae7..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-source.mdx +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: "Source" -id: "metadata-schema-source" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The source object allows you to query information about a particular source in a given job. - -### Arguments - -When querying for a `source`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this source object. - -### Example Queries - -#### Source information - -The query below pulls relevant information about a given source. For example, we could see the load time and the state (“pass”, “fail”, “error”) of that source. 
- -```graphql -{ - source(jobId: 123, uniqueId: "source.jaffle_shop.snowplow.event") { - uniqueId - sourceName - name - state - maxLoadedAt - criteria { - warnAfter { - period - count - } - errorAfter { - period - count - } - } - maxLoadedAtTimeAgoInS - } -} -``` - -### Fields - -When querying for a `source`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-sources.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-sources.mdx deleted file mode 100644 index e124e621baa..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-sources.mdx +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Sources" -id: "metadata-schema-sources" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - - -The sources object allows you to query information about all sources in a given job. - -### Arguments - -When querying for `sources`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema of this sources object. - -### Example Queries -As we noted above, database, schema, and identifier are all optional arguments. This means that with this endpoint, you can: -- Find a specific source by providing `..` -- Find all of the sources in a database and/or schema by providing `` and/or `` - -#### Finding sources by their database, schema, and identifier -The example query below finds a source by its unique database, schema, and identifier. - -```graphql -{ - sources(jobId: 123, database:"analytics", schema: "analytics", identifier:"dim_customers") { - uniqueId - } -} -``` - -#### Finding sources by their schema -The example query below finds all sources in this schema, and their respective states (pass, error, fail). 
- -```graphql -{ - sources(jobId: 123, schema: "analytics") { - uniqueId - state - } -} -``` - -### Fields -Sources has access to the *same fields* as the [Source node](/docs/dbt-cloud-apis/metadata-schema-source). The difference is that Sources can output a list, so instead of querying for fields for one specific source, you can query for those parameters for all sources within a jobID, database, etc. - -When querying for `sources`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-test.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-test.mdx deleted file mode 100644 index 55f32697278..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-test.mdx +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "Test" -id: "metadata-schema-test" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The test object allows you to query information about a particular test. - -### Arguments - -When querying for a `test`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this test object. - -### Example Queries -#### Test result - -The example query below outputs information about a test, including the state of the test result. This can be one of, in order of severity, "error", "fail", "warn", "pass." 
- -```graphql -{ - test(jobId: 123, uniqueId: "test.internal_analytics.not_null_metrics_id") { - runId - accountId - projectId - uniqueId - name - columnName - state - } -} -``` - -### Fields -When querying for a `test`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema-metadata-tests.mdx b/website/docs/docs/dbt-cloud-apis/schema-metadata-tests.mdx deleted file mode 100644 index b7e0e8fca2b..00000000000 --- a/website/docs/docs/dbt-cloud-apis/schema-metadata-tests.mdx +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: "Tests" -id: "metadata-schema-tests" ---- - -import { ArgsTable, SchemaTable } from "./schema"; - -The tests object allows you to query information about all tests in a given job. - - -### Arguments - -When querying for `tests`, the following arguments are available. Note that if you do not include a runId, it will default to the most recent run of the specified job: - - - -Below we show some illustrative example queries and outline the schema (all possible fields you can query) of this tests object. - -### Example Queries -#### Tests result - -The example query below finds all tests in this job, and includes information about those tests. 
- -```graphql -{ - tests(jobId: 123) { - runId - accountId - projectId - uniqueId - name - columnName - state - } -} -``` - -### Fields -When querying for `tests`, the following fields are available: - - diff --git a/website/docs/docs/dbt-cloud-apis/schema.jsx b/website/docs/docs/dbt-cloud-apis/schema.jsx index 47b6ddaf9cc..31568671573 100644 --- a/website/docs/docs/dbt-cloud-apis/schema.jsx +++ b/website/docs/docs/dbt-cloud-apis/schema.jsx @@ -1,11 +1,55 @@ -import React, { setState } from "react"; - +import React from "react"; import { useState, useEffect } from 'react' -const queriesQuery = `{ +const getTypeString = (typeStructure) => { + // Helper function to represent GraphQL type + if (!typeStructure) return '' + + if (typeStructure.kind === 'NON_NULL') { + return `${getTypeString(typeStructure.ofType)}!`; + } else if (typeStructure.kind === 'LIST') { + return `[${getTypeString(typeStructure.ofType)}]`; + } else if (['OBJECT', 'SCALAR', 'ENUM'].includes(typeStructure.kind)) { + return `${typeStructure.name}${getTypeString(typeStructure.ofType)}`; + } else { + return ''; + } +} + +export const ArgsTable = ({ data, name }) => { + return ( + + + + + + + + + + + {data.fields.find(d => d.name === name).args.map(function ({ name, description, type }) { + return ( + + + + + + + ) + })} + +
          FieldTypeRequired?Description
          {name}{getTypeString(type)}{type.kind === 'NON_NULL' ? `Yes` : `No`}{description || `No description provided`}
          + ) +} + +const metadataUrl = 'https://metadata.cloud.getdbt.com/graphql' +const metadataBetaUrl = 'https://metadata.cloud.getdbt.com/beta/graphql' + +const queryArgsQuery = `{ __schema { queryType { - fields { + fields(includeDeprecated: true) { name type { name @@ -20,7 +64,7 @@ const queriesQuery = `{ name description kind - ofType { name description } + ofType { kind name description } } } } @@ -28,96 +72,178 @@ const queriesQuery = `{ } }` - -export const ArgsTable = ({ queryName }) => { +export const QueryArgsTable = ({ queryName, useBetaAPI }) => { const [data, setData] = useState(null) useEffect(() => { const fetchData = () => { - fetch('https://metadata.cloud.getdbt.com/graphql', { + fetch(useBetaAPI ? metadataBetaUrl : metadataUrl, { method: "POST", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query: queriesQuery }), + body: JSON.stringify({ query: queryArgsQuery }), }) .then((result) => result.json()) .then((data) => setData(data)) } fetchData() }, []) - if (!data) { return

          Fetching data...

          } - return ( - - - - - - - - - - - {data.data.__schema.queryType.fields.find(d=>d.name===queryName).args.map(function ({name, description, type} ) { - return ( - - - {type.ofType ? - : - - } - - - - ) - })} - -
          FieldTypeRequired?Description
          {name}{type.ofType.name}{type.name}{type.kind === 'NON_NULL' ? `Yes` : `No`}{description || `No description provided`}
          + ) } +export const NodeArgsTable = ({ parent, name, useBetaAPI }) => { + const [data, setData] = useState(null) + useEffect(() => { + const fetchData = () => { + fetch(useBetaAPI ? metadataBetaUrl : metadataUrl, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + query: ` + query { + __type(name: "${parent}") { + ...FullType + } + } + + fragment FullType on __Type { + kind + fields(includeDeprecated: true) { + name + description + args { + name + description + defaultValue + type { + ...TypeRef + } + } + } + } + # get several levels + fragment TypeRef on __Type { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + } + } + } + } + } + } + } + } + `}) + }) + .then((result) => result.json()) + .then((data) => setData(data)) + } + fetchData() + }, []) + if (!data) { + return

          Fetching data...

          + } + return ( + + ) +} -export const SchemaTable = ({ nodeName }) => { +export const SchemaTable = ({ nodeName, useBetaAPI }) => { const [data, setData] = useState(null) useEffect(() => { const fetchData = () => { - fetch('https://metadata.cloud.getdbt.com/graphql', { + fetch(useBetaAPI ? metadataBetaUrl : metadataUrl, { method: "POST", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ query: `{ - __type(name: "${nodeName}") { - fields { + body: JSON.stringify({ + query: ` + query { + __type(name: "${nodeName}") { + ...FullType + } + } + + fragment FullType on __Type { + kind + name + description + fields(includeDeprecated: true) { name description - type { - name - description - kind - ofType { - name - description - ofType { - name - description - } - } + type { + ...TypeRef } } } - }`}), + + # get several levels + fragment TypeRef on __Type { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + ofType { + kind + name + } + } + } + } + } + } + } + } + `}), }) .then((result) => result.json()) .then((data) => setData(data)) } fetchData() }, []) - if (!data) { return

          Fetching data...

          } - return ( @@ -126,19 +252,13 @@ export const SchemaTable = ({ nodeName }) => { - + - {data.data.__type.fields.map(function ({name, description, type} ) { + {data.data.__type.fields.map(function ({ name, description, type }) { return ( - {type.kind==='LIST' ? - : - (type.ofType ? - : - - ) - } + ) @@ -147,4 +267,3 @@ export const SchemaTable = ({ nodeName }) => {
          Type Description
          {name}[{type.ofType.ofType ? type.ofType.ofType.name : type.ofType.name }]{type.ofType.name}{type.name}{getTypeString(type)} {description}
          ) } - diff --git a/website/docs/docs/dbt-cloud-apis/service-tokens.md b/website/docs/docs/dbt-cloud-apis/service-tokens.md index f4d3191756a..9553f48a013 100644 --- a/website/docs/docs/dbt-cloud-apis/service-tokens.md +++ b/website/docs/docs/dbt-cloud-apis/service-tokens.md @@ -3,87 +3,115 @@ title: "Service account tokens" id: "service-tokens" description: "Service account tokens help you define permissions for securing access to your dbt Cloud account and its projects." --- +:::info Important service account token update -## About service tokens +If you have service tokens created on or before July 18, 2023, please read [this important update](/docs/dbt-cloud-apis/service-tokens#service-token-update). + +::: Service account tokens enable you to securely authenticate with the dbt Cloud API by assigning each token a narrow set of permissions that more precisely manages access to the API. While similar to [User API tokens](user-tokens), service account tokens belong to an account rather than a user. You can use service account tokens for system-level integrations that do not run on behalf of any one user. Assign any permission sets available in dbt Cloud to your service account token, which can vary slightly depending on your plan: * Enterprise plans can apply any permission sets available to service tokens. -* Team plans can apply Account Admin, Member, Job Admin, Read-Only, and Metadata permissions sets to service tokens. +* Team plans can apply Account Admin, Member, Job Admin, Read-Only, and Metadata permissions set to service tokens. -You can assign as many permission sets as needed to one token. For more on permissions sets, see "[Enterprise Permissions](/docs/collaborate/manage-access/enterprise-permissions)." +You can assign as many permission sets as needed to one token. For more on permissions sets, see "[Enterprise Permissions](/docs/cloud/manage-access/enterprise-permissions)." 
-## Generating service account tokens +## Generate service account tokens -In the Account Settings view of dbt Cloud, you can click on the Service Account tokens page and generate a new token. Create and save your token somewhere safe. +You can generate service tokens if you have a Developer [license](/docs/cloud/manage-access/seats-and-users) and account admin [permissions](/docs/cloud/manage-access/about-user-access#permission-sets). To create a service token in dbt Cloud, follow these steps: -:::caution Note - -You will not be able to view this token again after generating it, so store the token somewhere safe for later use. - -::: +1. Open the **Account Settings** page by clicking the gear icon on the right-hand side. +2. On the left sidebar, click on **Service Tokens**. +3. Click the **+ New Token** button to generate a new token. +4. Once the token is generated, you won't be able to view this token again so make sure to save it somewhere safe. ## Permissions for service account tokens -You can assign service account tokens any permission set available in dbt Cloud. When you assign a permission set to a token, you will also be able to choose whether to grant that permissions to all projects in the account or to specific projects. +You can assign service account tokens to any permission set available in dbt Cloud. When you assign a permission set to a token, you will also be able to choose whether to grant those permissions to all projects in the account or to specific projects. ### Team plans using service account tokens The following permissions can be assigned to a service account token on a Team plan. **Account Admin**
          -Account Admin service tokens have full `read + write` access to an account, so please use them with caution. A Team plan refers to this permission set as an "Owner role." For more on these permissions, see [Account Admin](/docs/collaborate/manage-access/enterprise-permissions#account-admin). +Account Admin service tokens have full `read + write` access to an account, so please use them with caution. A Team plan refers to this permission set as an "Owner role." For more on these permissions, see [Account Admin](/docs/cloud/manage-access/enterprise-permissions#account-admin). **Metadata Only**
          -Metadata only service tokens can authorize requests to the metadata API. +Metadata-only service tokens authorize requests to the Discovery API. + +**Semantic Layer Only**
          +Semantic Layer-only service tokens authorize requests to the Semantic Layer APIs. **Job Admin**
          Job admin service tokens can authorize requests for viewing, editing, and creating environments, triggering runs, and viewing historical runs. **Member**
          -Member service tokens can authorize requests for viewing and editing resources, triggering runs, and inviting members to the account. Tokens assigned the Member permission set will have the same permissions as a Member user. For more information about Member users, see "[Self-service permissions](/docs/collaborate/manage-access/self-service-permissions)". +Member service tokens can authorize requests for viewing and editing resources, triggering runs, and inviting members to the account. Tokens assigned the Member permission set will have the same permissions as a Member user. For more information about Member users, see "[Self-service permissions](/docs/cloud/manage-access/self-service-permissions)". **Read-only**
          Read-only service tokens can authorize requests for viewing a read-only dashboard, viewing generated documentation, and viewing source freshness reports. ### Enterprise plans using service account tokens -The following permissions can be assigned to a service account token on an Enterprise plan. For more details about these permissions, see "[Enterprise permissions](/docs/collaborate/manage-access/enterprise-permissions)." +The following permissions can be assigned to a service account token on an Enterprise plan. For more details about these permissions, see "[Enterprise permissions](/docs/cloud/manage-access/enterprise-permissions)." **Account Admin**
          -Account Admin service tokens have full `read + write` access to an account, so please use them with caution. For more on these permissions, see [Account Viewer](/docs/collaborate/manage-access/enterprise-permissions#account-admin). +Account Admin service tokens have full `read + write` access to an account, so please use them with caution. For more on these permissions, see [Account Admin](/docs/cloud/manage-access/enterprise-permissions#account-admin). + +**Security Admin**
          +Security Admin service tokens have certain account-level permissions. For more on these permissions, see [Security Admin](/docs/cloud/manage-access/enterprise-permissions#security-admin). + +**Billing Admin**
          +Billing Admin service tokens have certain account-level permissions. For more on these permissions, see [Billing Admin](/docs/cloud/manage-access/enterprise-permissions#billing-admin). **Metadata Only**
          -Metadata only service tokens can authorize requests to the metadata API. +Metadata-only service tokens authorize requests to the Discovery API. + +**Semantic Layer Only**
          +Semantic Layer-only service tokens authorize requests to the Semantic Layer APIs. **Job Admin**
          -Job Admin service tokens can authorize request for viewing, editing, and creating environments, triggering runs, and viewing historical runs. For more on these permissions, see [Account Viewer](/docs/collaborate/manage-access/enterprise-permissions#job-admin). +Job Admin service tokens can authorize requests for viewing, editing, and creating environments, triggering runs, and viewing historical runs. For more on these permissions, see [Job Admin](/docs/cloud/manage-access/enterprise-permissions#job-admin). **Account Viewer**
          -Account Viewer service tokens have read only access to dbt Cloud accounts. For more on these permissions, see [Account Viewer](/docs/collaborate/manage-access/enterprise-permissions#account-viewer) on the Enterprise Permissions page. +Account Viewer service tokens have read-only access to dbt Cloud accounts. For more on these permissions, see [Account Viewer](/docs/cloud/manage-access/enterprise-permissions#account-viewer) on the Enterprise Permissions page. **Admin**
          -Admin service tokens have unrestricted access to projects in dbt Cloud accounts. You have the option to grant that permission all projects in the account or grant the permission only on specific projects. For more on these permissions, see [Admin Service](/docs/collaborate/manage-access/enterprise-permissions#admin-service) on the Enterprise Permissions page. +Admin service tokens have unrestricted access to projects in dbt Cloud accounts. You have the option to grant that permission to all projects in the account or grant the permission only on specific projects. For more on these permissions, see [Admin Service](/docs/cloud/manage-access/enterprise-permissions#admin-service) on the Enterprise Permissions page. **Git Admin**
          -Git admin service tokens have all the permissions listed in [Git admin](/docs/collaborate/manage-access/enterprise-permissions#git-admin) on the Enterprise Permissions page. +Git admin service tokens have all the permissions listed in [Git admin](/docs/cloud/manage-access/enterprise-permissions#git-admin) on the Enterprise Permissions page. -**Database Adminn**
          -Database admin service tokens have all the permissions listed in [Database admin](/docs/collaborate/manage-access/enterprise-permissions#database-admin) on the Enterprise Permissions page. +**Database Admin**
          +Database admin service tokens have all the permissions listed in [Database admin](/docs/cloud/manage-access/enterprise-permissions#database-admin) on the Enterprise Permissions page. **Team Admin**
          -Team admin service tokens have all the permissions listed in [Team admin](/docs/collaborate/manage-access/enterprise-permissions#team-admin) on the Enterprise Permissions page. +Team admin service tokens have all the permissions listed in [Team admin](/docs/cloud/manage-access/enterprise-permissions#team-admin) on the Enterprise Permissions page. **Job Viewer**
          -Job viewer admin service tokens have all the permissions listed in [Job viewer](/docs/collaborate/manage-access/enterprise-permissions#job-viewer) on the Enterprise Permissions page. +Job viewer service tokens have all the permissions listed in [Job viewer](/docs/cloud/manage-access/enterprise-permissions#job-viewer) on the Enterprise Permissions page. **Developer**
          -Developer service tokens have all the permissions listed in [Developer](/docs/collaborate/manage-access/enterprise-permissions#developer) on the Enterprise Permissions page. +Developer service tokens have all the permissions listed in [Developer](/docs/cloud/manage-access/enterprise-permissions#developer) on the Enterprise Permissions page. **Analyst**
          -Analyst admin service tokens have all the permissions listed in [Analyst](/docs/collaborate/manage-access/enterprise-permissions#analyst) on the Enterprise Permissions page. +Analyst service tokens have all the permissions listed in [Analyst](/docs/cloud/manage-access/enterprise-permissions#analyst) on the Enterprise Permissions page. **Stakeholder**
          -Stakeholder service tokens have all the permissions listed in [Stakeholder](/docs/collaborate/manage-access/enterprise-permissions#stakeholder) on the Enterprise Permissions page. +Stakeholder service tokens have all the permissions listed in [Stakeholder](/docs/cloud/manage-access/enterprise-permissions#stakeholder) on the Enterprise Permissions page. + + +## Service token update + +On July 18, 2023, dbt Labs made critical infrastructure changes to service account tokens. These enhancements improve the security and performance of all tokens created after July 18, 2023. To ensure security best practices are in place, we recommend you rotate your service tokens created before this date. + +To rotate your token: +1. Navigate to **Account settings** and click **Service tokens** on the left side pane. +2. Verify the **Created** date for the token is _on or before_ July 18, 2023. + +3. Click **+ New Token** on the top right side of the screen. Ensure the new token has the same permissions as the old one. +4. Copy the new token and replace the old one in your systems. Store it in a safe place, as it will not be available again once the creation screen is closed. +5. Delete the old token in dbt Cloud by clicking the **trash can icon**. _Only take this action after the new token is in place to avoid service disruptions_. 
+ diff --git a/website/docs/docs/dbt-cloud-apis/sl-api-overview.md b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md new file mode 100644 index 00000000000..3ddbf76d152 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-api-overview.md @@ -0,0 +1,61 @@ +--- +title: "Semantic Layer APIs" +id: sl-api-overview +description: "Integrate and query metrics and dimensions in downstream tools using the Semantic Layer APIs" +tags: [Semantic Layer, API] +hide_table_of_contents: true +pagination_next: "docs/dbt-cloud-apis/sl-jdbc" +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + +The rapid growth of different tools in the modern data stack has helped data professionals address the diverse needs of different teams. The downside of this growth is the fragmentation of business logic across teams, tools, and workloads. + +The [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) allows you to define metrics in code (with [MetricFlow](/docs/build/about-metricflow)) and dynamically generate and query datasets in downstream tools based on their dbt governed assets, such as metrics and models. Integrating with the dbt Semantic Layer will help organizations that use your product make more efficient and trustworthy decisions with their data. It also helps you to avoid duplicative coding, optimize development workflow, ensure data governance, and guarantee consistency for data consumers. + +You can use the dbt Semantic Layer for a variety of tools and applications of data. Some common use cases are: + +* Business intelligence (BI), reporting, and analytics +* Data quality and monitoring +* Governance and privacy +* Data discovery and cataloging +* Machine learning and data science + + + +import Features from '/snippets/_sl-plan-info.md' + + + +
          + + + + + + + +
          + + diff --git a/website/docs/docs/dbt-cloud-apis/sl-graphql.md b/website/docs/docs/dbt-cloud-apis/sl-graphql.md new file mode 100644 index 00000000000..0e39f50f60a --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-graphql.md @@ -0,0 +1,462 @@ +--- +title: "GraphQL" +id: sl-graphql +description: "Integrate and use the GraphQL API to query your metrics." +tags: [Semantic Layer, APIs] +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + + +[GraphQL](https://graphql.org/) (GQL) is an open-source query language for APIs. It offers a more efficient and flexible approach compared to traditional RESTful APIs. + +With GraphQL, users can request specific data using a single query, reducing the need for many server round trips. This improves performance and minimizes network overhead. + +GraphQL has several advantages, such as self-documenting, having a strong typing system, supporting versioning and evolution, enabling rapid development, and having a robust ecosystem. These features make GraphQL a powerful choice for APIs prioritizing flexibility, performance, and developer productivity. + +## dbt Semantic Layer GraphQL API + +The dbt Semantic Layer GraphQL API allows you to explore and query metrics and dimensions. Due to its self-documenting nature, you can explore the calls conveniently through the [schema explorer](https://semantic-layer.cloud.getdbt.com/api/graphql). + +dbt Partners can use the Semantic Layer GraphQL API to build an integration with the dbt Semantic Layer. 
+ +## Requirements to use the GraphQL API +- A dbt Cloud project on dbt v1.6 or higher +- Metrics are defined and configured +- A dbt Cloud [service token](/docs/dbt-cloud-apis/service-tokens) with "Semantic Layer Only" and "Metadata Only" permissions +- Your dbt project is configured and connected to a data platform + + +## Using the GraphQL API + +If you're a dbt user or partner with access to dbt Cloud and the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), you can [set up](/docs/use-dbt-semantic-layer/setup-sl) and test this API with data from your own instance by configuring the Semantic Layer and obtaining the right GQL connection parameters described in this document. + +Refer to [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) for more info. + + +### Authentication + +Authentication uses a dbt Cloud [service account token](/docs/dbt-cloud-apis/service-tokens) passed through a header as follows. To explore the schema, you can enter this information in the "header" section. + +``` +{"Authorization": "Bearer <SERVICE_TOKEN>"} +``` + +Each GQL request also requires a dbt Cloud `environmentId`. The API uses both the service token in the header and environmentId for authentication. + +### Metadata calls + +**Fetch data platform dialect** + +In some cases in your application, it may be useful to know the dialect or data platform that's internally used for the dbt Semantic Layer connection (such as if you are building `where` filters from a user interface rather than user-inputted SQL). + +The GraphQL API has an easy way to fetch this with the following query: + +```graphql +{ + environmentInfo(environmentId: BigInt!) { + dialect + } +} +``` + +**Fetch available metrics** + +```graphql +metrics(environmentId: BigInt!): [Metric!]! +``` + +**Fetch available dimensions for metrics** + +```graphql +dimensions( + environmentId: BigInt! + metrics: [MetricInput!]! +): [Dimension!]! 
+``` + +**Fetch available granularities given metrics** + +Note: This call for `queryableGranularities` returns only queryable granularities for metric time - the primary time dimension across all metrics selected. + +```graphql +queryableGranularities( + environmentId: BigInt! + metrics: [MetricInput!]! +): [TimeGranularity!]! +``` + +You can also get queryable granularities for all other dimensions using the `dimensions` call: + +```graphql +{ + dimensions(environmentId: BigInt!, metrics:[{name:"order_total"}]) { + name + queryableGranularities # --> ["DAY", "WEEK", "MONTH", "QUARTER", "YEAR"] + } +} +``` + +You can also optionally access it from the metrics endpoint: + +```graphql +{ + metrics(environmentId: BigInt!) { + name + dimensions { + name + queryableGranularities + } + } +} +``` + +**Fetch measures** + +```graphql +{ + measures(environmentId: BigInt!, metrics: [{name:"order_total"}]) { + name + aggTimeDimension + } +} +``` + +`aggTimeDimension` tells you the name of the dimension that maps to `metric_time` for a given measure. You can also query `measures` from the `metrics` endpoint, which allows you to see what dimensions map to `metric_time` for a given metric: + +```graphql +{ + metrics(environmentId: BigInt!) { + measures { + name + aggTimeDimension + } + } +} +``` + +**Fetch available metrics given a set of dimensions** + +```graphql +metricsForDimensions( + environmentId: BigInt! + dimensions: [GroupByInput!]! +): [Metric!]! +``` + +**Create Dimension Values query** + +```graphql + +mutation createDimensionValuesQuery( + environmentId: BigInt! + metrics: [MetricInput!] + groupBy: [GroupByInput!]! +): CreateDimensionValuesQueryResult! + +``` + +**Create Metric query** + +```graphql +createQuery( + environmentId: BigInt! + metrics: [MetricInput!]! + groupBy: [GroupByInput!] = null + limit: Int = null + where: [WhereInput!] = null + order: [OrderByInput!] = null +): CreateQueryResult +``` + +```graphql +MetricInput { + name: String! 
+} + +GroupByInput { + name: String! + grain: TimeGranularity = null +} + +WhereInput { + sql: String! +} + +OrderByInput { # -- pass one and only one of metric or groupBy + metric: MetricInput = null + groupBy: GroupByInput = null + descending: Boolean! = false +} +``` + +**Fetch query result** + +```graphql +query( + environmentId: BigInt! + queryId: String! +): QueryResult! +``` + +**Metric Types** + +```graphql +Metric { + name: String! + description: String + type: MetricType! + typeParams: MetricTypeParams! + filter: WhereFilter + dimensions: [Dimension!]! + queryableGranularities: [TimeGranularity!]! +} +``` + +``` +MetricType = [SIMPLE, RATIO, CUMULATIVE, DERIVED] +``` + +**Metric Type parameters** + +```graphql +MetricTypeParams { + measure: MetricInputMeasure + inputMeasures: [MetricInputMeasure!]! + numerator: MetricInput + denominator: MetricInput + expr: String + window: MetricTimeWindow + grainToDate: TimeGranularity + metrics: [MetricInput!] +} +``` + + +**Dimension Types** + +```graphql +Dimension { + name: String! + description: String + type: DimensionType! + typeParams: DimensionTypeParams + isPartition: Boolean! + expr: String + queryableGranularities: [TimeGranularity!]! +} +``` + +``` +DimensionType = [CATEGORICAL, TIME] +``` + +### Create Query examples + +The following section provides query examples for the GraphQL API, such as how to query metrics, dimensions, where filters, and more. + +**Query two metrics grouped by time** + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics: [{name: "food_order_amount"}] + groupBy: [{name: "metric_time"}, {name: "customer__customer_type"}] + ) { + queryId + } +} +``` + +**Query with a time grain** + +```graphql +mutation { + createQuery( + environmentId: BigInt! 
+ metrics: [{name: "order_total"}] + groupBy: [{name: "metric_time", grain: "month"}] + ) { + queryId + } +} +``` + +Note that when using granularity in the query, the output of a time dimension with a time grain applied to it always takes the form of a dimension name appended with a double underscore and the granularity level - `{time_dimension_name}__{DAY|WEEK|MONTH|QUARTER|YEAR}`. Even if no granularity is specified, it will also always have a granularity appended to it and will default to the lowest available (usually daily for most data sources). It is encouraged to specify a granularity when using time dimensions so that there won't be any unexpected results with the output data. + +**Query two metrics with a categorical dimension** + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics: [{name: "food_order_amount"}, {name: "order_gross_profit"}] + groupBy: [{name: "metric_time", grain: "month"}, {name: "customer__customer_type"}] + ) { + queryId + } +} +``` + +**Query with a where filter** + +The `where` filter takes a list argument (or a string for a single input). Depending on the object you are filtering, there are a couple of parameters: + + - `Dimension()` — Used for any categorical or time dimensions. If used for a time dimension, granularity is required. For example, `Dimension('metric_time').grain('week')` or `Dimension('customer__country')`. + +- `Entity()` — Used for entities like primary and foreign keys, such as `Entity('order_id')`. + +Note: If you prefer a more strongly typed `where` clause, you can optionally use `TimeDimension()` to separate out categorical dimensions from time ones. The `TimeDimension` input takes the time dimension name and also requires granularity. For example, `TimeDimension('metric_time', 'MONTH')`. + +```graphql +mutation { + createQuery( + environmentId: BigInt! 
+ metrics:[{name: "order_total"}] + groupBy:[{name: "customer__customer_type"}, {name: "metric_time", grain: "month"}] + where:[{sql: "{{ Dimension('customer__customer_type') }} = 'new'"}, {sql:"{{ Dimension('metric_time').grain('month') }} > '2022-10-01'"}] + ) { + queryId + } +} +``` + +**Query with Order** + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics: [{name: "order_total"}] + groupBy: [{name: "metric_time", grain: "month"}] + orderBy: [{metric: {name: "order_total"}}, {groupBy: {name: "metric_time", grain: "month"}, descending:true}] + ) { + queryId + } +} +``` + + +**Query with Limit** + +```graphql +mutation { + createQuery( + environmentId: BigInt! + metrics: [{name:"food_order_amount"}, {name: "order_gross_profit"}] + groupBy: [{name:"metric_time", grain: "month"}, {name: "customer__customer_type"}] + limit: 10 + ) { + queryId + } +} +``` + +**Query with Explain** + +This takes the same inputs as the `createQuery` mutation. + +```graphql +mutation { + compileSql( + environmentId: BigInt! + metrics: [{name:"food_order_amount"}, {name:"order_gross_profit"}] + groupBy: [{name:"metric_time", grain:"month"}, {name:"customer__customer_type"}] + ) { + sql + } +} +``` + +### Output format and pagination + +**Output format** + +By default, the output is in Arrow format. You can switch to JSON format using the following parameter. However, due to performance limitations, we recommend using the JSON parameter for testing and validation. The JSON received is a base64 encoded string. To access it, you can decode it using a base64 decoder. The JSON is created from pandas, which means you can change it back to a dataframe using `pandas.read_json(json, orient="table")`. Or you can work with the data directly using `json["data"]`, and find the table schema using `json["schema"]["fields"]`. Alternatively, you can pass `encoded:false` to the jsonResult field to get a raw JSON string directly. 
+ + +```graphql +{ + query(environmentId: BigInt!, queryId: Int!, pageNum: Int! = 1) { + sql + status + error + totalPages + arrowResult + jsonResult(orient: PandasJsonOrient! = TABLE, encoded: Boolean! = true) + } +} +``` + +The results default to the table but you can change it to any [pandas](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_json.html) supported value. + +**Pagination** + +By default, we return 1024 rows per page. If your result set exceeds this, you need to increase the page number using the `pageNum` option. + +### Run a Python query + +The `arrowResult` in the GraphQL query response is a byte dump, which isn't visually useful. You can convert this byte data into an Arrow table using any Arrow-supported language. Refer to the following Python example explaining how to query and decode the arrow result: + + +```python +import base64 +import pyarrow as pa + +headers = {"Authorization":"Bearer "} +query_result_request = """ +{ + query(environmentId: 70, queryId: "12345678") { + sql + status + error + arrowResult + } +} +""" + +gql_response = requests.post( + "https://semantic-layer.cloud.getdbt.com/api/graphql", + json={"query": query_result_request}, + headers=headers, +) + +""" +gql_response.json() => +{ + "data": { + "query": { + "sql": "SELECT\n ordered_at AS metric_time__day\n , SUM(order_total) AS order_total\nFROM semantic_layer.orders orders_src_1\nGROUP BY\n ordered_at", + "status": "SUCCESSFUL", + "error": null, + "arrowResult": "arrow-byte-data" + } + } +} +""" + +def to_arrow_table(byte_string: str) -> pa.Table: + """Get a raw base64 string and convert to an Arrow Table.""" + with pa.ipc.open_stream(base64.b64decode(byte_string)) as reader: + return pa.Table.from_batches(reader, reader.schema) + + +arrow_table = to_arrow_table(gql_response.json()["data"]["query"]["arrowResult"]) + +# Perform whatever functionality is available, like convert to a pandas table. 
+print(arrow_table.to_pandas()) +""" +order_total ordered_at + 3 2023-08-07 + 112 2023-08-08 + 12 2023-08-09 + 5123 2023-08-10 +""" +``` diff --git a/website/docs/docs/dbt-cloud-apis/sl-jdbc.md b/website/docs/docs/dbt-cloud-apis/sl-jdbc.md new file mode 100644 index 00000000000..4d0d4f6d1a2 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-jdbc.md @@ -0,0 +1,367 @@ +--- +title: "JDBC" +id: sl-jdbc +description: "Integrate and use the JDBC API to query your metrics." +tags: [Semantic Layer, API] +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + +The dbt Semantic Layer Java Database Connectivity (JDBC) API enables users to query metrics and dimensions using the JDBC protocol, while also providing standard metadata functionality. + +A JDBC driver is a software component enabling a Java application to interact with a data platform. Here's some more information about our JDBC API: + +- The Semantic Layer JDBC API utilizes the open-source JDBC driver with ArrowFlight SQL protocol. +- You can download the JDBC driver from [Maven](https://search.maven.org/remotecontent?filepath=org/apache/arrow/flight-sql-jdbc-driver/12.0.0/flight-sql-jdbc-driver-12.0.0.jar). +- The dbt Semantic Layer supports ArrowFlight SQL driver version 12.0.0 and higher. +- You can embed the driver into your application stack as needed, and you can use dbt Labs' [example project](https://github.com/dbt-labs/example-semantic-layer-clients) for reference. +- If you’re a partner or user building a homegrown application, you’ll need to install an AWS root CA to the Java Trust [documentation](https://www.amazontrust.com/repository/) (specific to Java and JDBC call). 
+ +dbt Labs partners can use the JDBC API to build integrations in their tools with the dbt Semantic Layer + +## Using the JDBC API + +If you are a dbt user or partner with access to dbt Cloud and the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), you can [setup](/docs/use-dbt-semantic-layer/setup-sl) and test this API with data from your own instance by configuring the Semantic Layer and obtaining the right JDBC connection parameters described in this document. + +You *may* be able to use our JDBC API with tools that do not have an official integration with the dbt Semantic Layer. If the tool you use allows you to write SQL and either supports a generic JDBC driver option (such as DataGrip) or supports Dremio and uses ArrowFlightSQL driver version 12.0.0 or higher, you can access the Semantic Layer API. + +Refer to [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) for more info. + +## Authentication + +dbt Cloud authorizes requests to the dbt Semantic Layer API. You need to provide an environment ID, host, and [service account tokens](/docs/dbt-cloud-apis/service-tokens). + +## Connection parameters + +The JDBC connection requires a few different connection parameters. + +This is an example of a URL connection string and the individual components: + +``` +jdbc:arrow-flight-sql://semantic-layer.cloud.getdbt.com:443?&environmentId=202339&token=SERVICE_TOKEN +``` + +| JDBC parameter | Description | Example | +| -------------- | ----------- | ------- | +| `jdbc:arrow-flight-sql://` | The protocol for the JDBC driver. | `jdbc:arrow-flight-sql://` | +| `semantic-layer.cloud.getdbt.com` | The [access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your account's dbt Cloud region. You must always add the `semantic-layer` prefix before the access URL. 
| For dbt Cloud deployment hosted in North America, use `semantic-layer.cloud.getdbt.com` | +| `environmentId` | The unique identifier for the dbt production environment, you can retrieve this from the dbt Cloud URL
          when you navigate to **Environments** under **Deploy**. | If your URL ends with `.../environments/222222`, your `environmentId` is `222222`

          | +| `SERVICE_TOKEN` | dbt Cloud [service token](/docs/dbt-cloud-apis/service-tokens) with “Semantic Layer Only” and "Metadata Only" permissions. Create a new service token on the **Account Settings** page. | `token=SERVICE_TOKEN` | + +*Note — If you're testing locally on a tool like DataGrip, you may also have to provide the following variable at the end or beginning of the JDBC URL `&disableCertificateVerification=true`. + +## Querying the API for metric metadata + +The Semantic Layer JDBC API has built-in metadata calls which can provide a user with information about their metrics and dimensions. + +Refer to the following tabs for metadata commands and examples: + + + + + +Use this query to fetch all defined metrics in your dbt project: + +```bash +select * from {{ + semantic_layer.metrics() +}} +``` + + + + +Use this query to fetch all dimensions for a metric. + +Note, `metrics` is a required argument that lists one or multiple metrics in it. + +```bash +select * from {{ + semantic_layer.dimensions(metrics=['food_order_amount'])}} +``` + + + + + +Use this query to fetch dimension values for one or multiple metrics and single dimension. + +Note, `metrics` is a required argument that lists one or multiple metrics in it, and a single dimension. + +```bash +select * from {{ +semantic_layer.dimension_values(metrics=['food_order_amount'], group_by=['customer__customer_name'])}} +``` + + + + + +Use this query to fetch queryable granularities for a list of metrics. This API request allows you to only show the time granularities that make sense for the primary time dimension of the metrics (such as `metric_time`), but if you want queryable granularities for other time dimensions, you can use the `dimensions()` call, and find the column queryable_granularities. + +Note, `metrics` is a required argument that lists one or multiple metrics in it. 
+ +```bash +select * from {{ + semantic_layer.queryable_granularities(metrics=['food_order_amount', 'order_gross_profit'])}} +``` + + + + + + + + + +Use this query to fetch available metrics given dimensions. This command is essentially the opposite of getting dimensions given a list of metrics. + +Note, `group_by` is a required argument that lists one or multiple dimensions in it. + +```bash +select * from {{ + semantic_layer.metrics_for_dimensions(group_by=['customer__customer_type']) + +}} +``` + + + + + +Use this example query to fetch available granularities for all time dimensions (the similar queryable granularities API call only returns granularities for the primary time dimensions for metrics). The following call is a derivative of the `dimensions()` call and specifically selects the granularities field. + +```bash +select NAME, QUERYABLE_GRANULARITIES from {{ + semantic_layer.dimensions( + metrics=["order_total"] + ) +}} + +``` + + + + + +It may be useful in your application to expose the names of the time dimensions that represent `metric_time` or the common thread across all metrics. + +You can first query the `metrics()` argument to fetch a list of measures, then use the `measures()` call which will return the name(s) of the time dimensions that make up metric time. + +```bash +select * from {{ + semantic_layer.measures(metrics=['orders']) +}} +``` + + + + +## Querying the API for metric values + +To query metric values, here are the following parameters that are available: + +| Parameter | Description | Example | Type | +| --------- | -----------| ------------ | -------------------- | +| `metrics` | The metric name as defined in your dbt metric configuration | `metrics=['revenue']` | Required | +| `group_by` | Dimension names or entities to group by. We require a reference to the entity of the dimension (other than for the primary time dimension), which is pre-appended to the front of the dimension name with a double underscore. 
| `group_by=['user__country', 'metric_time']` | Optional | +| `grain` | A parameter specific to any time dimension and changes the grain of the data from the default for the metric. | `group_by=[Dimension('metric_time')`
          `.grain('week\|day\|month\|quarter\|year')]` | Optional | +| `where` | A where clause that allows you to filter on dimensions and entities using parameters. This takes a filter list OR string. Inputs come with `Dimension`, and `Entity` objects. Granularity is required if the `Dimension` is a time dimension | `where="{{ Dimension('customer__country') }} = 'US'"` | Optional | +| `limit` | Limit the data returned | `limit=10` | Optional | +| `order_by` | Order the data returned by a particular field | `order_by=['order_gross_profit']`, use `-` for descending, or full object notation if the object is operated on: `order_by=[Metric('order_gross_profit').descending(True)]` | Optional | +| `compile` | If true, returns generated SQL for the data platform but does not execute | `compile=True` | Optional | + + + +## Note on time dimensions and `metric_time` + +You will notice that in the list of dimensions for all metrics, there is a dimension called `metric_time`. `metric_time` is a reserved keyword for the measure-specific aggregation time dimensions. For any time-series metric, the `metric_time` keyword should always be available for use in queries. This is a common dimension across *all* metrics in a semantic graph. + +You can look at a single metric or hundreds of metrics, and if you group by `metric_time`, it will always give you the correct time series. + +Additionally, when performing granularity calculations that are global (not specific to a particular time dimension), we recommend you always operate on `metric_time` and you will get the correct answer. + +Note that `metric_time` should be available in addition to any other time dimensions that are available for the metric(s). In the case where you are looking at one metric (or multiple metrics from the same data source), the values in the series for the primary time dimension and `metric_time` are equivalent. + + +## Examples + +Refer to the following examples to help you get started with the JDBC API. 
+ +### Fetch metadata for metrics + +You can filter/add any SQL outside of the templating syntax. For example, you can use the following query to fetch the name and dimensions for a metric: + +```bash +select name, dimensions from {{ + semantic_layer.metrics() + }} + WHERE name='food_order_amount' +``` + +### Query common dimensions + +You can select common dimensions for multiple metrics. Use the following query to fetch the name and dimensions for multiple metrics: + +```bash +select * from {{ + semantic_layer.dimensions(metrics=['food_order_amount', 'order_gross_profit']) + }} +``` + +### Query grouped by time + +The following example query uses the [shorthand method](#faqs) to fetch revenue and new customers grouped by time: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount','order_gross_profit'], + group_by=['metric_time']) + }} +``` + +### Query with a time grain + +Use the following example query to fetch multiple metrics with a change in time dimension granularities: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month')]) + }} +``` + +### Group by categorical dimension + +Use the following query to group by a categorical dimension: + +```bash +select * from {{ + semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month'), 'customer__customer_type']) + }} +``` + +### Query with where filters + +Where filters in API allow for a filter list or string. We recommend using the filter list for production applications as this format will realize all benefits from the where possible. + +Where Filters have a few objects that you can use: + +- `Dimension()` - This is used for any categorical or time dimensions. 
If used for a time dimension, granularity is required - `Dimension('metric_time').grain('week')` or `Dimension('customer__country')` + +- `Entity()` - Used for entities like primary and foreign keys - `Entity('order_id')` + +Note: If you prefer a more explicit path to create the `where` clause, you can optionally use the `TimeDimension` feature. This helps separate out categorical dimensions from time-related ones. The `TimeDimension` input takes the time dimension name and also requires granularity, like this: `TimeDimension('metric_time', 'MONTH')`. + + +Use the following example to query using a `where` filter with the string format: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], +group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], +where="{{ Dimension('metric_time').grain('month') }} >= '2017-03-09' AND {{ Dimension('customer__customer_type') }} in ('new') AND {{ Entity('order_id') }} = 10") +}} +``` + +Use the following example to query using a `where` filter with a filter list format: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], +group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], +where=[{{ Dimension('metric_time').grain('month') }} >= '2017-03-09', {{ Dimension('customer__customer_type') }} in ('new'), {{ Entity('order_id') }} = 10]) +}} +``` + +### Query with a limit + +Use the following example to query using a `limit` or `order_by` clause: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time')], + limit=10) + }} +``` +### Query with Order By Examples + +Order By can take a basic string that's a Dimension, Metric, or Entity and this will default to ascending order. + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time')], + 
limit=10, + order_by=['order_gross_profit']) + }} +``` + +For descending order, you can add a `-` sign in front of the object. However, you can only use this shorthand notation if you aren't operating on the object or using the full object notation. + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time')], + limit=10, + order_by=[-'order_gross_profit']) + }} +``` +If you are ordering by an object that's been operated on (e.g., change granularity), or you are using the full object notation, descending order must look like: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('week')], + limit=10, + order_by=[Metric('order_gross_profit').descending(True), Dimension('metric_time').grain('week').descending(True) ]) + }} +``` + +Similarly, this will yield ascending order: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('week')], + limit=10, + order_by=[Metric('order_gross_profit'), Dimension('metric_time').grain('week')]) + }} +``` + + +### Query with compile keyword + +Use the following example to query using a `compile` keyword: + +```bash +select * from {{ +semantic_layer.query(metrics=['food_order_amount', 'order_gross_profit'], + group_by=[Dimension('metric_time').grain('month'),'customer__customer_type'], + compile=True) + }} +``` + +## FAQs + +- **Why do some dimensions use different syntax, like `metric_time` versus `Dimension('metric_time')`?**
          + When you select a dimension on its own, such as `metric_time`, you can use the shorthand method which doesn't need the “Dimension” syntax. However, when you perform operations on the dimension, such as adding granularity, the object syntax `Dimension('metric_time')` is required. + +- **What does the double underscore `"__"` syntax in dimensions mean?**
          + The double underscore `"__"` syntax indicates a mapping from an entity to a dimension, as well as where the dimension is located. For example, `user__country` means someone is looking at the `country` dimension from the `user` table. + +- **What is the default output when adding granularity?**
          + The default output follows the format `{time_dimension_name}__{granularity_level}`. So for example, if the time dimension name is `ds` and the granularity level is yearly, the output is `ds__year`. + +## Related docs + +- [dbt Semantic Layer integration best practices](/guides/dbt-ecosystem/sl-partner-integration-guide) + diff --git a/website/docs/docs/dbt-cloud-apis/sl-manifest.md b/website/docs/docs/dbt-cloud-apis/sl-manifest.md new file mode 100644 index 00000000000..6ecac495869 --- /dev/null +++ b/website/docs/docs/dbt-cloud-apis/sl-manifest.md @@ -0,0 +1,100 @@ +--- +title: "Semantic manifest" +id: sl-manifest +description: "Learn about the semantic manifest.json file and how you can use artifacts to gain insights about your dbt Semantic Layer." +tags: [Semantic Layer, APIs] +sidebar_label: "Semantic manifest" +pagination_next: null +--- + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + + + +dbt creates an [artifact](/reference/artifacts/dbt-artifacts) file called the _Semantic Manifest_ (`semantic_manifest.json`), which MetricFlow requires to build and run metric queries properly for the dbt Semantic Layer. This artifact contains comprehensive information about your dbt Semantic Layer. It is an internal file that acts as the integration point with MetricFlow. + +By using the semantic manifest produced by dbt Core, MetricFlow will instantiate a data flow plan and generate SQL from Semantic Layer query requests. It's a valuable reference that you can use to understand the structure and details of your data models. + +Similar to the [`manifest.json` file](/reference/artifacts/manifest-json), the `semantic_manifest.json` also lives in the `/target` directory of your dbt project. This is where dbt stores various artifacts (such as compiled models and tests) generated during the execution of your project. + +## How it's produced + +The `semantic_manifest.json` is produced whenever your dbt project is parsed. 
The easiest way to generate the file yourself is to run `dbt parse`. Since `dbt run`, `dbt build`, and `dbt compile` all parse your dbt project, these commands will generate a semantic manifest as well. + + +## Top level keys + +Top-level keys for the semantic manifest are: +- `semantic_models` — Starting points of data with entities, dimensions, and measures, and correspond to models in your dbt project. +- `metrics` — Functions combining measures, constraints, and so on to define quantitative indicators. +- `project_configuration` — Contains information around your project configurations + +
          +Example target/semantic_manifest.json file + +```json +{ + "semantic_models": [ + { + "name": "semantic model name", + "defaults": null, + "description": "semantic model description", + "node_relation": { + "alias": "model alias", + "schema_name": "model schema", + "database": "model db", + "relation_name": "Fully qualified relation name" + }, + "entities": ["entities in the semantic model"], + "measures": ["measures in the semantic model"], + "dimensions": ["dimensions in the semantic model" ] + } + ], + "metrics": [ + { + "name": "name of the metric", + "description": "metric description", + "type": "metric type", + "type_params": { + "measure": { + "name": "name for measure", + "filter": "filter for measure", + "alias": "alias for measure" + }, + "numerator": null, + "denominator": null, + "expr": null, + "window": null, + "grain_to_date": null, + "metrics": ["metrics used in defining the metric. this is used in derived metrics"], + "input_measures": [] + }, + "filter": null, + "metadata": null + } + ], + "project_configuration": { + "time_spine_table_configurations": [ + { + "location": "fully qualified table name for timespine", + "column_name": "date column", + "grain": "day" + } + ], + "metadata": null, + "dsi_package_version": {} + } +} +``` + +
          + +## Related docs + +- [dbt Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) +- [About dbt artifacts](/reference/artifacts/dbt-artifacts) + diff --git a/website/docs/docs/dbt-cloud-apis/user-tokens.md b/website/docs/docs/dbt-cloud-apis/user-tokens.md index 6ebf160dda4..77e536b12a5 100644 --- a/website/docs/docs/dbt-cloud-apis/user-tokens.md +++ b/website/docs/docs/dbt-cloud-apis/user-tokens.md @@ -1,11 +1,12 @@ --- title: "User tokens" id: "user-tokens" +pagination_next: "docs/dbt-cloud-apis/service-tokens" --- ## User API tokens -Each dbt Cloud user with a [Developer license](cloud-seats-and-users) is +Each dbt Cloud user with a [Developer license](/docs/cloud/manage-access/seats-and-users) is issued an API token. This token can be used to execute queries against the dbt Cloud API on the user's behalf. User API tokens inherit the permissions of the user the that they were created for. @@ -13,4 +14,9 @@ permissions of the user the that they were created for. You can find your User API token in the Profile page under the `API Access` label. - + + +## FAQs + + + diff --git a/website/docs/docs/dbt-cloud-environments.md b/website/docs/docs/dbt-cloud-environments.md new file mode 100644 index 00000000000..8fa4522d47c --- /dev/null +++ b/website/docs/docs/dbt-cloud-environments.md @@ -0,0 +1,48 @@ +--- +title: "dbt Cloud environments" +id: "dbt-cloud-environments" +description: "Learn about dbt Cloud's development environment to execute your project in the IDE" +pagination_next: null +--- + +An environment determines how dbt Cloud will execute your project in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) or [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) (for development) and scheduled jobs (for deployment). + +Critically, in order to execute dbt, environments define three variables: + +1. The version of dbt Core that will be used to run your project +2. 
The warehouse connection information (including the target database/schema settings) +3. The version of your code to execute + +Each dbt Cloud project can have only one [development environment](#create-a-development-environment), but there is no limit to the number of [deployment environments](/docs/deploy/deploy-environments), providing you the flexibility and customization to tailor the execution of scheduled jobs. + +Use environments to customize settings for different stages of your project and streamline the execution process by using software engineering principles. This page will detail the different types of environments and how to intuitively configure your development environment in dbt Cloud. + + +import CloudEnvInfo from '/snippets/_cloud-environments-info.md'; + + + + +## Create a development environment + +To create a new dbt Cloud development environment: + +1. Navigate to **Deploy** -> **Environments** +2. Click **Create Environment**. +3. Select **Development** as the environment type. +4. Fill in the fields under **General Settings** and **Development Credentials**. +5. Click **Save** to create the environment. + +### Set developer credentials + +To use the dbt Cloud IDE or dbt Cloud CLI, each developer will need to set up [personal development credentials](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#access-the-cloud-ide) to your warehouse connection in their **Profile Settings**. This allows you to set separate target information and maintain individual credentials to connect to your warehouse. + + + + + +## Deployment environment + +Deployment environments in dbt Cloud are necessary to execute scheduled jobs and use other features. A dbt Cloud project can have multiple deployment environments, allowing for flexibility and customization. However, a dbt Cloud project can only have one deployment environment that represents the production source of truth. 
+ +To learn more about dbt Cloud deployment environments and how to configure them, visit the [Deployment environments](/docs/deploy/deploy-environments) page. For our best practices guide, read [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) for more info. diff --git a/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version.md b/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version.md deleted file mode 100644 index c4a7e39d91c..00000000000 --- a/website/docs/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version.md +++ /dev/null @@ -1,74 +0,0 @@ ---- -title: "Choosing a dbt version" -id: "cloud-choosing-a-dbt-version" ---- - -In dbt Cloud, both jobs and environments are configured to use a specific version of dbt Core. The version can be upgraded at any time. - -### Environments - -Navigate to the settings page of an environment, then click **edit**. Click the **dbt Version** dropdown bar and make your selection. From this list, you can select an available version of Core to associate with this environment. - - - -Be sure to save your changes before navigating away. - -### Jobs - -Each job in dbt Cloud can be configured to inherit parameters from the environment it belongs to. - - - -The example job seen in the screenshot above belongs to the environment "Prod". It inherits the dbt version of its environment as shown by the **Inherited from ENVIRONMENT_NAME (DBT_VERSION)** selection. You may also manually override the dbt version of a specific job to be any of the current Core releases supported by Cloud by selecting another option from the dropdown. - -## Supported Versions - -We have always encouraged our customers to upgrade dbt Core versions whenever a new minor version is released. We released our first major version of dbt - `dbt 1.0` - in December 2021. 
Alongside this release, we updated our policy on which versions of dbt Core we will support in dbt Cloud. - - - > **Starting with v1.0, any subsequent minor versions will be supported in dbt Cloud for 1 year post release. At the end of the 1 year window, accounts must upgrade to a supported version of dbt or risk service disruption.** - -We will continue to update this table so that customers know when we plan to stop supporting different versions of Core in dbt Cloud. - - - - -:::warning ⚠️ v0.X Non-Supported Period - Accounts had until the end of June 2022 to upgrade to dbt 1.0 or later. Pre-dbt 1.0 versions will no longer receive patch fixes, and our support team will no longer assist with dbt version specific help on non-supported versions of dbt. Additionally, jobs running dbt versions prior to 1.0 may experience service disruptions before the end of the year and may be removed from the dbt Cloud context by year end. You will receive additional notification before any planned disruption to your production jobs. -::: - -Starting in v1.0, dbt Cloud will ensure that you're always using the latest compatible patch release of `dbt-core` and plugins, including all the latest fixes. You may also choose to try prereleases of those patch releases before they are generally available. - - - -For more on version support and future releases, see [Understanding dbt Core versions](core-versions). - -#### What will actually happen on the end of support date? - -1 year post a minor version release of v1.X, we will try to run our users' projects on the latest release of dbt if they have not already upgraded their projects themselves. In a post dbt v1.0 world, there won't be breaking changes between minor versions of dbt, so we might be reasonably successful at upgrading our users' versions for them. However, our strong preference is for accounts to try to manage the upgrade process themselves which is a more cautious way to prevent failures to their production pipelines. 
We will give accounts consistent communication that they're hitting the end of their supported window, so they can plan accordingly. - -#### What should you be doing today? - -You should **upgrade to v1.0 as soon as you can** - and we recommend that you proceed **slowly and steadily**. - -Why? Because attempting to upgrade 6 minor versions at one time (v0.15.0 —> v0.21.0) implies 6x the potential for breaking changes, versus upgrading a single minor version. - -Refactoring code is much easier when you're updating a well-defined, constrained surface area. Doing things incrementally is the way to go. - -Additionally upgrading to more recent versions of dbt Core will enable better performance and more features in dbt Cloud. Below is a compatability matrix between dbt versions and dbt Cloud features. Hopefully this provides more motivation to always update your environments and jobs to run the latest version of dbt. - -| dbt Cloud Feature | dbt Core Version Needed | -| ------------- | -------------- | -| [Environment variable secret scrubbing](/docs/build/environment-variables#handling-secrets)| v1.0+ | -| DAG in the IDE | v0.20.0+| -| [Metadata API](/docs/dbt-cloud-apis/metadata-api) |v0.19.0+| -| [Dashboard status tiles](/docs/deploy/dashboard-status-tiles) | v0.19.0+ | -| [Slim CI](/docs/deploy/cloud-ci-job) | v0.18.0+ | - -#### Need help upgrading? - -If you want more advice on how to upgrade your dbt projects, check out our [migration guides](/guides/migration/versions/) and our [upgrading Q&A page](/docs/dbt-versions/upgrade-core-in-cloud). 
diff --git a/website/docs/docs/dbt-cloud/cloud-ide/ide-beta.md b/website/docs/docs/dbt-cloud/cloud-ide/ide-beta.md deleted file mode 100644 index 7e8fdc16c0b..00000000000 --- a/website/docs/docs/dbt-cloud/cloud-ide/ide-beta.md +++ /dev/null @@ -1,221 +0,0 @@ ---- -title: "Develop in the dbt Cloud IDE (beta)" -id: "ide-beta" ---- - - - -:::info Join our beta - -If you’d like to try the dbt Cloud IDE for multi-tenant instances, please [sign up](https://docs.google.com/forms/d/e/1FAIpQLSdlU65gqTZPyGAUc16SkxqTc50NO9vdq_KGx1Mjm_4FB_97FA/viewform) to join the beta. To learn more about the beta features, you can read this documentation. - -::: - -## Overview - -The dbt Cloud integrated development environment (IDE) is where you can build, test, run, and version control your dbt projects directly from your browser. The IDE is the fastest and most reliable way to deploy dbt, and provides a real-time editing and execution environment for your dbt project -- no command line use required. - -To develop in dbt Cloud IDE (beta), you need to meet these requirements: - - -- Your dbt project must be compatible with dbt v0.15.0. The dbt IDE is powered by the [dbt-rpc](reference/commands/rpc) which was overhauled in dbt v0.15.0. -- You must have a [Developer License](/docs/collaborate/manage-access/seats-and-users). -- Currently only multi-tenant instances of dbt Cloud can develop in the updated beta version of the Cloud IDE. Single-tenant instances will soon be able to opt into this Beta release. -- Your dbt repository (in dbt Cloud) must have `write` access enabled. See [Connecting your GitHub Account](/docs/collaborate/git/connect-github) and [Importing a project by git URL](/docs/collaborate/git/import-a-project-by-git-url) for detailed setup instructions. - - -The IDE is a single interface for building, testing, running, and version controlling dbt projects from your browser. Anyone can use the IDE, from new dbt developers to seasoned practitioners. 
- - -To use the dbt Cloud IDE, you need to log in with a dbt Cloud account and click **Develop** at the top of the page. - -You can refer to [Getting Started with dbt Cloud](/docs/get-started/getting-started/set-up-dbt-cloud) to quickly get set up and perform some key tasks. For more information, see the following articles: - -- [What is dbt?](docs/introduction#what-else-can-dbt-do) -- [Building your first project](/docs/get-started/getting-started/building-your-first-project) -- [dbt Learn courses](https://courses.getdbt.com/collections) -- [Using Git](https://docs.github.com/en/github/getting-started-with-github/using-git) - -**Is there a cost to using the dbt Cloud IDE?** - -Not at all! You can use dbt Cloud when you sign up for the Free [Developer plan](https://www.getdbt.com/pricing/), which comes with one developer seat. If you’d like to access more features or have more developer seats, you can upgrade your account to the Team or Enterprise plan. See dbt [Pricing plans](https://www.getdbt.com/pricing/) for more details. - -**Can I be a contributor to dbt Cloud?** - -Anyone can contribute to the dbt project. And whether it's a dbt package, a plugin, dbt-core, or this documentation site, contributing to the open source code that supports the dbt ecosystem is a great way to level yourself up as a developer, and give back to the community. See [Contributing](/docs/contributing/oss-expectations) for details on what to expect when contributing to the dbt open source software (OSS). - -**What is the difference between developing on the dbt Cloud IDE and on the CLI?** - -There are two main ways to develop with dbt: using the web-based IDE in dbt Cloud or using the command-line interface (CLI) in dbt Core. - -- **dbt Cloud IDE** - dbt Cloud is a Web-based application that allows you to develop dbt projects with the IDE, includes a purpose-built scheduler, and provides an easier way to share your dbt documentation with your team. 
The IDE is a faster and more reliable way to deploy your dbt models, and provides a real-time editing and execution environment for your dbt project. - -- **dbt Core CLI** - The CLI uses [dbt Core](docs/introduction), an [open-source](https://github.com/dbt-labs/dbt) software that’s freely available. You can build your dbt project in a code editor, like Jetbrains or VSCode, and run dbt commands from the command line. - -**What type of support is provided with dbt Cloud?** - -The global dbt Support team is available to help dbt Cloud users by email or in-product live chat. Developer and Team accounts offer 24x5 support, while Enterprise customers have priority access and options for custom coverage. - -If you have project-related or modeling questions, you can use our dedicated [Community Forum](/community/forum) to get help as well. - -## dbt Cloud IDE features - -With dbt Cloud IDE, you can: - -- Write modular SQL models with `select` statements and the [`ref()`](/reference/dbt-jinja-functions/ref) function -- Test every model before deploying it to production -- Share the generated documentation of your models with all data stakeholders -- Deploy safely using development environments like how git-enabled version control enables collaboration and a return to previous states - - -**Find and replace** - -Press Command-F or Ctrl-F to open the find and replace bar in the upper right corner of the current file in the IDE. The IDE highlights your search results in the current file and code outline. You can use the up and down arrows to see the match highlighted in the current file when there are multiple matches. To replace the text with something else, use the left arrow. - -**Search across files** - -You can quickly search over all files in the IDE on your current project. To search, press Command-O or Ctrl-O to open the search bar, and then enter the text or file name you want to find across all files in your current project. 
You can view the results under the search text, which are grouped into files containing the match. You can click on the results to view it in the IDE. - -**Keyboard shortcuts** - -There are default keyboard shortcuts that can help make development more productive and easier for everyone. Press Fn-F1 to view a list of all of them. - -**Multiple selections** - -You can make multiple selections for quick and simultaneous edits. The below commands are a common way to add more cursors and allow you to insert cursors below or above with ease. - -- Option-Command-Down arrow -- Option-Command-Up arrow -- Press Option and click on an area - - - -**File explorer** - -The File explorer on the left side of the IDE allows you to organize your project and manage your files and folders. Click the three dot menu associated with the file or folder to create, rename, and delete it. - -**Drag and drop** - -You can also drag and drop files located in the file explorer. Use the file breadcrumb on the top of the IDE for quick, linear navigation. You can access adjacent files in the same file by right clicking on the breadcrumb file. - - - -**Organize tabs** - -You can move your tabs around to reorganize your work in the IDE. You can also right click on a tab to view and select a list of actions to take. - - - -## Development in the dbt Cloud IDE Beta - -With the IDE, you can compile dbt code into SQL and run it against your database directly. It leverages the open-source [dbt-rpc](/reference/commands/rpc) plugin to intelligently recompile only the changes in your project. - -The dbt Cloud IDE Beta brings the startup and interaction time for dbt project development down from minutes to seconds. - -In dbt, SQL files can contain Jinja, a lightweight templating language. Using Jinja in SQL provides a way to use control structures (e.g. `if` statements and `for` loops) in your queries. It also lets you share SQL code through `macros`. 
- -You can invoke dbt commands, compile jinja into query, preview data from the warehouse, visualize a directed acyclic graph (DAG), and more. - -**Hot and cold start** - -You can launch the dbt Cloud IDE from a cold start or a hot start. - -- **Cold start** -- The process of starting an IDE session for the first time. Cold starting the IDE can take about 30 seconds to load. Behind the scene, dbt is provisioning a dedicated server for you to build a dbt project. After this step finishes, the IDE is ready for use. In the meantime, dbt is also starting up the dbt-rpc container to interact with dbt-core. You don’t need to wait for this to finish before the IDE is ready for editing. - -- **Hot start** -- The process of resuming an existing IDE session (within 3 hours of the last activity). Hot starting is faster and takes less than 3 seconds to load. This is because the environment is already available and you’re simply resuming your session. - -dbt Labs closely monitors these two start modes as key performance metrics to help ensure consistent and reliable experiences. - -**Work retention** - -You must save your work to avoid losing it. The dbt Cloud IDE needs an explicit action to save your changes. There are three ways your work is stored: - -- **Unsaved, local code** -- Any code you write is automatically available from your browser’s storage. You can see your changes but will lose it if you switch branches or browsers (another device or browser). -- **Saved but uncommitted code** -- When you save a file, the data gets stored in your local storage (EFS storage). If you switch branches but don’t _commit_ your saved changes, you will lose your changes. -- **Committed code** -- Your git branch repository contains all your changes. You can check out other branches or switch browsers without losing your changes. - - -**Run projects** - -You can also *build*, *run* *and test* dbt projects directly in the dbt IDE using our ‘Build’ feature. 
You can use dbt's [rich model selection syntax](https://docs.getdbt.com/reference/node-selection/syntax) to [run dbt commands](https://docs.getdbt.com/reference/dbt-commands) directly within dbt Cloud. - -The IDE updates in real-time as models, tests, seeds, and operations are run. If a model or test fails, you can review the logs to find and fix the issue. - - - -**Lineage tab** - -The visual in the Lineage tab adds more context to your dependencies and directional flow. - -You get to see how models are used as building blocks from left to right to transform your data from crude or normalized raw sources, into cleaned-up modular derived pieces, and finally into the final outputs on the far right of the DAG, ready to be used by the analyst in infinite combinations to present it in ways to help clients, customers, and organizations make better decisions. - -You can access files in the lineage tab by double clicking on a particular model. - -**Command bar + status** - -You can enter and run commands from the command bar at the bottom of the IDE. Use the [rich model selection syntax](/reference/node-selection/syntax) to [run dbt commands](/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking **History**. - -:::info Note - -For your convenience, dbt Cloud automatically includes ‘`dbt`’ as a prefix to your command so you don’t need to enter it. You can also type the ‘`dbt`’ prefix in your command. -::: - - -The status icon on the lower right corner of the IDE gives you an indicator of the health of your project. You can identify errors by clicking on the status icon for more details or by clicking **Restart the IDE**. - -**Generating and viewing documentation** - -To generate your project’s documentation (docs) in the IDE, enter `docs generate` or `dbt docs generate` in the command bar. This command generates the docs for your dbt project as it currently exists in development. 
- -After you generate a successful run, you can view your [documentation](https://docs.getdbt.com/docs/building-a-dbt-project/documentation) for your dbt project in real time. You can inspect and verify what your project's documentation will look like before you deploy your changes to production. - -Click **View Docs** on top of the file explorer to see the latest version of your documentation rendered in a new browser window. - - - -## Version control basics - -A [version control](https://en.wikipedia.org/wiki/Version_control) system allows you and your teammates to work collaboratively, safely, and simultaneously on a single project. Version control helps you track all the code changes made in the dbt Cloud IDE. - -When you develop in the dbt Cloud IDE, you can leverage Git directly to version control your code from your browser. This means you can branch, commit, push, and pull code with a couple of clicks - no command line required! - -You can create a separate branch to develop and make changes. The changes you make aren’t merged into the main branch unless it successfully passes tests. This helps keep the code organized and improves productivity by making the development process smooth. - - - -:::info Note -To use version control, make sure you are connected to a Git repository in the IDE. -::: - -**Common git terms** - - -| Name | Definition | -| --- | --- | -| Repository or repo | A repository is a directory that stores all the files, folders, and content needed for your project. You can think of this as an object database of the project, storing everything from the files themselves to the versions of those files, commits, and deletions. Repositories are not limited by user, and can be shared and copied.| -| Branch | A branch is a parallel version of a repository. It is contained within the repository, but does not affect the primary or main branch allowing you to work freely without disrupting the _live_ version. 
When you've made the changes you want to make, you can merge your branch back into the main branch to publish your changes | -| Checkout | The checkout command is used to create a new branch, change your current working branch to a different branch, or switch to a different version of a file from a different branch. | -| Commit | A commit is a user’s change to a file (or set of files). When you make a commit to save your work, Git creates a unique ID that allows you to keep a record of the specific changes committed along with who made them and when. Commits usually contain a commit message which is a brief description of what changes were made. | -| Main | The primary, base branch of all repositories. All committed and accepted changes should be on the Main (or master) branch. In the dbt Cloud IDE, the “Main” branch will be read-only. This is because any changes/edits to code cannot and should not be made directly in the base branch. A new branch should be created in order to make any changes to your project | -| Merge | Merge takes the changes from one branch and adds them into another (usually main) branch. These commits are usually first requested via pull request before being merged by a maintainer. | -| Pull Request | If someone has changed code on a separate branch of a project and wants it to be reviewed to add to the main branch, they can submit a pull request. Pull requests ask the repo maintainers to review the commits made, and then, if acceptable, merge the changes upstream. A pull happens when adding the changes to the main branch. | -| Push | A push updates a remote branch with the commits made to the current branch. You are literally “pushing” your changes onto the remote. | -| Remote | This is the version of a repository or branch that is hosted on a server. Remote versions can be connected to local clones so that changes can be synced. 
| - - -**The Git button in the IDE** - -The git button in the dbt Cloud IDE allows you to apply the concept of version control to your project. This page provides descriptions of each git button command and what they do: - -| Name | Actions | -| --- | --- | -| Abort merge | This option allows you to cancel a merge that had conflicts. Please note that all the changes will be reset, and this operation cannot be reverted, so make sure to commit or save all your changes before you start a merge. | -| Change branch | This option will allow you to change between branches (checkout). | -| Commit and push | Committing is similar to saving any changes made within your repo. In the above situation, the changes being saved or committed are the initialization of the project. The required files and folders are being added. When you make changes to your code in the future, you'll need to commit them as well. This allows you to record what changes were made, when they were made, and who made them. | -| Create new branch | This allows you to branch off of your base branch and edit your project. You’ll notice after initializing your project that the “main” branch will be read-only. This is because any changes to code cannot and should not be made directly in the base branch. A new branch should be created in order to make any changes to your project. | -| Initialize your project | This is done when first setting up your project. Initializing a project creates all required directories and files within an empty repository. Note: This option will not display if your repo isn't completely empty (i.e. includes a README file). Once you click **initialize your project** you'll want to click **commit** to finish setting up your project. 
| -| Open pull request | This allows you to open a pull request in Git for peers to review changes before merging into the base branch.| -| Pull changes from master/main | This option is available if you are on any local branch that is behind the remote version of the base branch or the remote version of the branch that you're currently on. | -| Pull from remote | This option is available if you’re on the local base branch and changes have recently been pushed to the remote version of the branch. As such, pulling in changes from the remote repo will allow you to pull in the most recent version of the base branch. | -| Reclone Your Repository | This allows you to reset your repository back to a fresh clone from your remote. You can use this option when you need to reclone your repo or if there are any git-related errors you’re experiencing in the dbt Cloud IDE. Reclone your repository is not available in the beta launch | -| Refresh git state | This enables you to pull new branches from a different remote branch to your local branch with just one command. | diff --git a/website/docs/docs/dbt-cloud/cloud-ide/viewing-docs-in-the-ide.md b/website/docs/docs/dbt-cloud/cloud-ide/viewing-docs-in-the-ide.md deleted file mode 100644 index 8e219fe1c2f..00000000000 --- a/website/docs/docs/dbt-cloud/cloud-ide/viewing-docs-in-the-ide.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: Viewing Docs in the IDE -id: viewing-docs-in-the-ide ---- - -The dbt Cloud IDE makes it possible to view [documentation](/docs/collaborate/documentation) -for your dbt project while your code is still in development. With this -workflow, you can inspect and verify what your project's generated documentation -will look like before your changes are released to production. - -## Generating documentation - -To generate documentation in the IDE, run the `dbt docs generate` command in the -Command Bar in the IDE. 
This command will generate the Docs -for your dbt project as it exists in development in your IDE session. - - - -After generating your documentation, you can click the "view docs" button to see -the latest version of your documentation rendered in a new browser window. - - diff --git a/website/docs/docs/dbt-cloud/cloud-overview.md b/website/docs/docs/dbt-cloud/cloud-overview.md deleted file mode 100644 index 3a436e92b5a..00000000000 --- a/website/docs/docs/dbt-cloud/cloud-overview.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -title: "About dbt Cloud" -id: "cloud-overview" ---- - - -[dbt Cloud](https://www.getdbt.com/product/) is a hosted service that helps data analysts and engineers productionize dbt deployments. It comes equipped with turnkey support for scheduling jobs, CI/CD, serving documentation, monitoring & alerting, and an integrated developer environment (IDE). - -dbt Cloud’s generous Developer (free) plan and deep integration with dbt Core make it well suited for data teams small and large alike. - -You can [sign up](https://www.getdbt.com/signup/) to get started with dbt Cloud. - - -## Develop dbt projects - -You can use the [dbt Cloud IDE](/docs/get-started/develop-in-the-cloud) to develop, run, and version control dbt projects on the web. - -## Schedule and run dbt jobs in production - -Set up custom schedules to run your production dbt jobs. dbt Cloud's comprehensive scheduling interface makes it possible to schedule jobs by day of week, time of day, or a recurring interval. - - - -## Democratize access to logs - -dbt Cloud makes it easy to view in-progress and historical logs for your dbt runs. From dbt Cloud, you can view and download the run logs for your dbt invocations. If you're happy ssh'ing into a cron server and running `tail -f` on a logfile, then this feature is *not* for you! - - - -## Generate and distribute documentation - -dbt Cloud hosts and authorizes access to dbt project documentation. 
After enabling documentation for a given job, you can click the "View Documentation" button to see the latest documentation for that job. Because these docs are generated on a schedule, they're always up to date! Simply invite your coworkers to dbt Cloud to share your project's documentation with the rest of your team. More info about enabling docs for your jobs can be found [here](/docs/collaborate/cloud-build-and-view-your-docs). - - - -## Enable Continuous Integration - -:::info Available on the Basic Tier - -Continuous integration functionality is available to accounts on the Basic Tier or higher. - -::: - -dbt Cloud can be configured to run your dbt projects in a temporary schema when new commits are pushed to open pull requests. When the Cloud job completes, a status will be shown for the PR inside of GitHub. This build-on-PR functionality is a great way to catch bugs before deploying to production, and an essential tool in any analyst's belt. More info on enabling CI workflows in dbt Cloud can be found [here](/docs/deploy/cloud-ci-job). - - diff --git a/website/docs/docs/dbt-cloud/on-premises/dependencies.md b/website/docs/docs/dbt-cloud/on-premises/dependencies.md deleted file mode 100644 index 6278612476b..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/dependencies.md +++ /dev/null @@ -1,103 +0,0 @@ ---- -id: dependencies -title: External Dependencies ---- - -:::note - -We no longer support new on-premises deployments, and instead have moved to a [Single Tenant](single-tenant) model hosted in the cloud - -::: - -This guide is intended to help administrators running instances of dbt Cloud on-premises understand the internal components of their instance, as well as how their instance will interact with other services over the internet. - -## Required External Dependencies - -dbt Cloud has several external network dependencies that are required for normal operation. 
This section enumerates the required external dependencies for a normal dbt Cloud installation. - -### Replicated - -Replicated (https://www.replicated.com/) is a third-party service that provides a management layer over the dbt Cloud Kubernetes appliance. Our installer configures an additional KOTS (https://kots.io/) appliance to configure and manage the dbt Cloud application. KOTS is open-source, Apache 2 licensed software developed by Replicated. - -An overview of Replicated's security posture can be found at: https://www.replicated.com/resources/files/Replicated-Security-Whitepaper.pdf - -#### Initial Installation - -During initial installation, the KOTS appliance can be directly downloaded from the Internet or packaged up and delivered to your infrastructure. Access to the following resources is required: - -##### Accessed via HTTPS - -- `get.replicated.com`: This endpoint hosts the Replicated install script. -- `registry.replicated.com`: The private Replicated Docker registry. -- `registry-data.replicated.com`: The private Replicated Docker registry. -- `quay.io`: Some dependencies of Replicated are hosted as public images in the Quay.io registry. -- `hub.docker.com`: Some dependencies of Replicated are hosted as public images in Docker Hub. - -Replicated maintains a list of Replicated-owned IPs for IP access restriction purposes at https://github.com/replicatedhq/ips/blob/main/ip_addresses.json. - -#### dbt Cloud Appliance Installation and Upgrades - -To install the dbt Cloud appliance or perform updates, some external connections are required. All connections are initiated from inside the network, and can vary depending on the installation method and the application update. - -##### Accessed via HTTPS - -- `api.replicated.com`: This endpoint services the license sync check and used to pull down yaml for app upgrades. -- `registry.replicated.com`: The private Replicated Docker registry. 
-- `registry-data.replicated.com`: The private Replicated Docker registry. -- `quay.io`: Some dependencies of Replicated are hosted as public images in the Quay.io registry. -- `hub.docker.com`: Some dependencies of Replicated are hosted as public images in Docker Hub. -- `usage.getdbt.com`: Your installation will send usage data to our server once per week. The schema of the data is as follows: - -```json -{ - "accounts": [ - { - "id": 1, - "name": "fishtown_analytics", - "developer_licenses": 20, - "read_only_licenses": 100, - "dbt_cloud_version": "1.0.0" - } - ] -} - -``` - -Replicated maintains a list of Replicated-owned IPs for IP access restriction purposes at https://github.com/replicatedhq/ips/blob/main/ip_addresses.json. - -#### Ongoing Access - -In order to perform basic maintenance and license checking, the following outbound access is required: - -##### Accessed via HTTPS - -- `api.replicated.com`: This endpoint services the license sync check and is used to pull down yaml files for app upgrades. - -## Optional External Dependencies - -### Integrations - -_Can be individually enabled or disabled_ - -dbt Cloud supports integrations with a number of third-party applications. If enabled, these applications will require some further network access. - -- **GitHub, GitHub Enterprise, GitHub Enterprise Server**: if the GitHub integration is enabled, and you wish to make use of PR builds, you will need to grant GitHub the ability to send webhooks to your dbt Cloud instance. -- **Slack**: if enabled, you can send notifications on completed dbt Cloud runs to your Slack organization. -- **Email over SMTP**: if enabled, your dbt Cloud instance will be able to send email to your users. -- **Datadog**: if enabled, your dbt Cloud instance will attempt to send metrics and logs to Datadog. Note that this requires a valid Datadog agent installation. - - -## Inbound (Client) Traffic - -dbt Cloud requires some ports to be opened for inbound traffic from admins and end users. 
All inbound traffic is secured using TLS / HTTPS. Upon installation, the Replicated appliance will generate a self-signed cert, and then prompt the admin configuring the installation to provide an SSL certificate to be used for securing inbound client requests to the application. - -The required inbound ports are: - -- 443 (tcp): For end user access to the dbt Cloud application. -- 8800 (tcp): For admin access to the dbt Cloud admin console. - -### Additional Information - -For additional information related to inbound traffic, view the following sections. - -- [Application Data Flows](/docs/deploy/architecture#application-data-flows) -- [Data Warehouse Interaction](/docs/deploy/architecture#data-warehouse-interaction) -- [Customer Managed Network Architecture](/docs/deploy/architecture) diff --git a/website/docs/docs/dbt-cloud/on-premises/faqs.md b/website/docs/docs/dbt-cloud/on-premises/faqs.md deleted file mode 100644 index 1de5b32b498..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/faqs.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -title: Customer Managed FAQs ---- - -:::note - -We no longer support new on-premises deployments, and instead have moved to a [Single Tenant](single-tenant) model hosted in the cloud - -::: - -## Customizations - -### Adding custom ingresses - -All ingress to dbt Cloud goes through a component called the **API Gateway**. The Kubernetes deployment that makes up the API Gateway has the label `name: api-gateway` and serves up all traffic on port 8000. Within your dbt Cloud installation, you can create custom services and ingresses to these pods by targeting that set of labels with custom services. - -**Example: on an embedded cluster, add a service that does _not_ terminate TLS** - -This example only applies to an embedded cluster (dbt Cloud installed onto a VM). Note that exposing dbt Cloud traffic over http is insecure, and not recommended. 
- -```yaml -apiVersion: v1 -kind: Service -metadata: - name: api-gateway-http-only - labels: - name: api-gateway-http-only -spec: - ports: - - name: http - port: 8000 - targetPort: 8000 - nodePort: - selector: - name: api-gateway - type: NodePort -``` - -**Example: on an existing cluster, add a service that terminates TLS** - -This example only applies to an existing cluster (dbt Cloud installed onto pre-existing Kubernetes). It requires that you have a valid [TLS secret](https://kubernetes.io/docs/concepts/services-networking/ingress/#tls) available in the same Kubernetes namespace as the dbt Cloud application. - -We recommend the [nginx ingress controller](https://kubernetes.github.io/ingress-nginx/deploy/) as a simple way to deploy ingresses that terminate TLS in your cluster. You will need to follow the installation instructions to install the controller before proceeding. - -Replace `` with the "hostname" setting from your dbt Cloud instance. Then, apply the YAML manifest below to the namespace where dbt Cloud is installed. - -```yaml -apiVersion: networking.k8s.io/v1beta1 -kind: Ingress -metadata: - annotations: - kubernetes.io/ingress.class: nginx - name: nginx-ingress -spec: - tls: - - hosts: - - "" - # This assumes tls-secret exists and the SSL - # certificate contains a CN for - # "" - secretName: "" - rules: - - host: "" - http: - paths: - - backend: - serviceName: api-gateway - servicePort: 8000 - path: / -``` - -For more examples of using the nginx ingress controller, see [the Examples section](https://kubernetes.github.io/ingress-nginx/examples/tls-termination/) of their documentation. - -## Troubleshooting - -### Restarting the dbt Cloud Application - -Certain tasks may require restarting the dbt Cloud application such as updating a configuration value. In order to accomplish this, the below commands can be run. 
Note that when these commands are run, the dbt Cloud application (including the IDE and job scheduler) will be unavailable for a few minutes until the restart is complete. - -```bash -kubectl rollout restart deployment/api-gateway -kubectl rollout restart deployment/app -kubectl rollout restart deployment/scheduler -``` - -### Restarting the Configuration Console (kotsadm) - -Certain tasks may require restarting the Configuration Console (kotsadm) such as changing the TLS certificate. In order to accomplish this, the below commands can be run. Note that when these commands are run, the Configuration Console will be unavailable for a few minutes until the restart is complete. - -```bash -kubectl rollout restart deployment/kotsadm -kubectl rollout restart deployment/kotsadm-api -``` diff --git a/website/docs/docs/dbt-cloud/on-premises/index.md b/website/docs/docs/dbt-cloud/on-premises/index.md deleted file mode 100644 index 548f597b1aa..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/index.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -id: index -title: Overview ---- - -:::note - -We no longer support new on-premises deployments, and instead have moved to a [Single Tenant](single-tenant) model hosted in the cloud - -::: - -The Customer Managed deployment environment is configured and managed by the customer. While this deployment model allows for the greatest level of customization and ownership of the dbt Cloud infrastructure, it may not be ideal for most customers as it also comes with the greatest level of troubleshooting and maintenance costs. - -dbt Cloud is a bundled Kubernetes appliance that can be automatically installed into a standalone Kubernetes cluster. - -dbt Cloud uses a collection of open source technologies called Kubernetes-off-the-shelf ([KOTS](https://kots.io/)) to manage installations. Internally, dbt Cloud is a Kubernetes appliance, but you do not need to already use Kubernetes to run it. 
It supports two different deployment models: installation into an existing Kubernetes cluster, or installation onto a bare Linux instance or VM. Installation into a VM is not recommended as this adds an additional layer of complexity and lacks the reliability and scalability of a native Kubernetes deployment. - -The following pages are intended for system administrators of Customer Managed installations and describe the various steps needed to install and maintain a Customer Managed deployment. diff --git a/website/docs/docs/dbt-cloud/on-premises/installation.md b/website/docs/docs/dbt-cloud/on-premises/installation.md deleted file mode 100644 index 9ba856b27f4..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/installation.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -id: installation -title: On-Premises Installation (dbt Cloud) ---- - -:::note 📌 - -We no longer support new on-premises deployments, and instead have moved to a [Single Tenant](/docs/deploy/single-tenant) model hosted in the cloud - -::: - -Before proceeding with installation, please make sure to review the [prerequisites](/docs/dbt-cloud/on-premises/prerequisites) and [system requirements](/docs/dbt-cloud/on-premises/system-requirements). - -### Installation into an existing Kubernetes cluster - -Installation into an existing Kubernetes cluster requires two steps. First, you will install a kubectl plugin (kots) that allows you to dynamically apply dbt Cloud configurations into your existing cluster, as well as overlay your own custom Kubernetes patches. Second, you will install [kotsadm](https://github.com/replicatedhq/kotsadm), an installable admin console for managing Kubernetes appliances, including dbt Cloud. (Later in this document, we will refer to the kotsadm UI as the "Configuration Console.") Both of the required tools are open source. - -Both of the following commands must be run on the same machine, where the machine has access to the Kubernetes cluster where you want to run dbt Cloud. 
- -First, install the kubectl plugin by running the following command. - -```bash -curl https://kots.io/install | bash -``` - -Second, install kotsadm into your cluster by running: - -```bash -kubectl kots install dbt-cloud-v1 -``` - -The installer will immediately prompt you for a namespace to deploy both kotsadm and dbt Cloud into. You can select any namespace you like. All resources (except for a `ClusterRole` for kotsadm) will be installed into the namespace you select. - -Next, you will be prompted for a password to use to secure the admin console. Record the generated password somewhere safe, as you will need it to manage the dbt Cloud appliance. **If you lose this password, you will lose access to the admin console!** - -After installation is complete, you can serve the admin console on a local machine by running: - -``` -kubectl kots admin-console --namespace -``` - -This will serve up the admin console at `localhost:8800` on the machine running the command. - -### Installation into a VM - -SSH into the dbt Cloud VM. You can install the entire dbt Cloud application by running the following command: - -```bash -curl -sSL https://kurl.sh/dbt-cloud-v1 | sudo bash -``` - -This runs a shell script produced by [kURL](https://kurl.sh/docs/) which will: - -- bootstrap a self-contained Kubernetes cluster onto your instance, -- install the kots admin console, -- and, install the dbt Cloud application. - -If your machine has multiple IP addresses, the installer will prompt you to select an IP address on which to host the kots admin console. Choose a private IP suitable for your installation. **It is not recommended to host the kots admin console on a public IP address.** - -This will take a few minutes. After it is complete, you will see the following output: - -``` - Installation - Complete ✔ - -Kotsadm: http://:8800 -Login with password (will not be shown again): -``` - -Record the generated password somewhere safe. You will need it to log into the admin console. 
**If you lose this password, you will lose access to the admin console!** diff --git a/website/docs/docs/dbt-cloud/on-premises/prerequisites.md b/website/docs/docs/dbt-cloud/on-premises/prerequisites.md deleted file mode 100644 index 59c3ce08465..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/prerequisites.md +++ /dev/null @@ -1,170 +0,0 @@ ---- -id: prerequisites -title: Prerequisites ---- - -:::note - -We no longer support new on-premises deployments, and instead have moved to a [Single Tenant](single-tenant) model hosted in the cloud - -::: - -There are a number of prerequisites that must be in place before starting the installation process. There are also some optional prerequisites that will need to be completed to enable optional features of dbt Cloud. - -## Required - -### License - -Each dbt Cloud installation requires a License file. To obtain a license file, [contact sales](mailto:sales@getdbt.com). - -### Backend Database - -dbt Cloud uses PostgreSQL as its backend database. The application bundle can provision an embedded Postgres instance, but this is not recommended for production use. You should configure an external Postgres instance with your cloud provider to enable enhanced backup and monitoring capability. When you configure the application using the configuration UI, you will provide credentials that dbt Cloud can use to connect to this Postgres instance. - -Supported versions include 9.6, 10, and 11. The database should be configured with at least 50GB of storage. - -dbt Cloud supports authentication to the database by [password authentication](https://www.postgresql.org/docs/12/auth-password.html) only. Other [PostgreSQL authentication methods](https://www.postgresql.org/docs/12/client-authentication.html) are not currently supported. - -For major cloud providers, it is recommended to use the PostgreSQL PaaS offered by that cloud provider. 
These PaaS offerings include monitoring, security, and backups, which should be configured consistent with customer data policies. - -The dbt Cloud application can be rolled back at any time by restoring a PostgreSQL backup. - -#### Amazon Web Services - -Customers hosting dbt Cloud on AWS should use [RDS for PostgreSQL](https://aws.amazon.com/rds/postgresql/). The RDS instance should be a `db.t3.medium` or larger. - -#### Google Cloud Platform - -Customers hosting dbt Cloud on Google Cloud Platform should use [Cloud SQL for PostgreSQL](https://cloud.google.com/sql/docs/postgres). The instance should `db-n1-standard-1` or larger. - -#### Microsoft Azure - -Customers hosting dbt Cloud on Microsoft Azure should use [Azure Database for PostgreSQL](https://azure.microsoft.com/en-us/services/postgresql/). The instance should have at least 2 vCores and 4GB of memory. - -### Object Storage - -dbt Cloud supports S3 (and S3-compatible APIs) or Azure Blob Storage as its object storage solution for logs and run artifacts. You will need to provide two buckets (s3) or containers (azure) to dbt Cloud: one for logs generated by your dbt runs, and another for all of the other artifacts generated by your dbt runs. - -On AWS, you can use either instance profile based or AWS keypair based authentication with S3. The instance profile or keypair should be provisioned with the below IAM permissions before creating the instance and installing the application. - - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": [ - "s3:PutObject", - "s3:GetObjectAcl", - "s3:GetObject", - "s3:ListBucket", - "s3:HeadBucket" - ], - "Resource": [ - "arn:aws:s3:::", - "arn:aws:s3:::/*", - "arn:aws:s3:::", - "arn:aws:s3:::/*" - ] - } - ] -} -``` - - -For Azure, you will need the full connection string for each of the two containers. - -#### Google Cloud Storage - -For Google Cloud Storage, you will need to use the S3-compatible API. 
After provisioning the two buckets, you will need to generate an HMAC key (https://cloud.google.com/storage/docs/authentication/hmackeys) that has access to the two buckets. You will use this keypair as your S3 Access Key and Access Secret in the Configuration console. When you go through the configuration process, under S3 Region, pick the GCS region (or multi-region alias). For S3 Endpoint, you should use https://storage.googleapis.com. - -### SSL/TLS Certificate - -Your dbt Cloud instance should live at a domain name controlled by your organization (ex. `dbt..com`). To secure users' connections to the dbt Cloud application, you will need to provision and upload a TLS certificate to the dbt Cloud instance. - -For installations into an existing Kubernetes cluster, SSL termination should be handled by an external load balancer or a customer-managed proxy service. After installation, there will be service of type ClusterIP in the provided namespace called `api-gateway`. You must define a custom ingress or service connecting your load balancer to the dbt Cloud application. - -### Storage Classes (Existing Kubernetes Only) - -dbt Cloud requires a StorageClass that supports creating PersistentVolumeClaims in ReadWriteMany mode. Kubernetes has a number of ready-made storage provisioners for supporting different custom storage classes here: https://github.com/kubernetes-incubator/external-storage - -We recommend using NFS. - -## Optional - -### Email over SMTP - -dbt Cloud has the ability to send emails over SMTP. The emails that dbt Cloud can send include: - -- **Post-run notifications**: your users can get notified when runs finish. -- **Email invitations**: if your instance supports login via email/password, you can send email invitations that will allow users to log in. -- **Password resets**: if your instance supports login via email/password, users can reset their own passwords. 
- -To configure SMTP, you will provide a set of SMTP credentials in the configuration console. These credentials include: - -- **SMTP Host**: the hostname of your SMTP server. -- **SMTP Port**: the port (often 25 or 465) -- **SMTP Username**: the username to use when logging into the SMTP server. For some platforms, this will be someone's actual username. For others, you can generate a service account username. -- **SMTP Password**: the password for the given username. -- **System "From" Email Address**: The "From" email address that dbt Cloud will use. Typically this will be a "no-reply" or system address. If you'd like to enable your users to reply to these emails, you can change the "From" address accordingly. Make sure that you have configured your SMTP server to allow outbound email coming from this "From" address. - -### Github - -dbt Cloud can integrate with your Github, Github Enterprise, or Github Enterprise Server installation to enable interacting with Github Pull Requests, as well as automatically permissioning your environments via Github's API. To set this up, you will need to be an [Owner](https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/permission-levels-for-an-organization) on your Github organization. - -First, log into Github, and navigate to your organization's settings page. Click **Github Apps** (not OAuth Apps!), and then click **New Github App** to start configuring a new Github app for your dbt Cloud instance. - -Enter the following values, replacing `` with the hostname of your dbt Cloud installation. 
- -- GitHub App name: `dbt Cloud` -- Homepage URL: `https://www.getdbt.com` -- User authorization callback URL: `https:///complete/github` -- Setup URL: `https:///#/profile/integrations` -- Webhook URL: `https:///api/v1/webhooks/github/` - -Under Permissions, select the following: - -- **Checks**: Read & write -- **Repository contents**: Read & write -- **Repository metadata**: Read-only -- **Pull requests**: Read-only -- **Commit statuses**: Read & write - -Under Subscribe to events, select the following: - -- **Pull request** -- **Push** - -Save the new application. - -During dbt Cloud setup, the installer will need to provide credentials for this application to the dbt Cloud configuration console. - -They will need the following: - -- the base URL of your Github enterprise installation, e.g. github.mycompany.com or github.com (if you're cloud-hosted) -- the scheme-included API URL of your Github enterprise installation. Usually https://github.mycompany.com/api/v3 or https://api.github.com (if you're cloud-hosted) -- the App ID, Client ID, and Client Secret from the "About" page of the newly created Github app -- the Configuration URL for your Github app: right click "Install app" on the "About" page, click "Copy Link Location" to get the Configuration URL -- the Install URL: right click "Public page" on the "About" page, click "Copy Link Location," and paste in the value here. -- the Private Key PEM: on the "About" page of the newly created Github app, scroll down to the bottom of the page. Under "Private Keys," click "Generate a Private Key." Download the key and provide it to the installer. - -### Slack - -dbt Cloud can send Slack notifications when runs finish. You can provide a set of Slack credentials for dbt Cloud to use to interact with a Slack app you create. - -### Datadog - -dbt Cloud can be configured to send logs and metrics to your Datadog account.
This requires configuration and installation of a [Datadog agent](https://www.datadoghq.com/blog/monitoring-kubernetes-with-datadog/#install-the-datadog-agent). - -### Google SSO - -If running dbt Cloud in GCP with Private Google Access enabled, SSO can be configured either using the default googleapis.com URL or setting a custom one. - -### Okta SSO - -dbt Cloud can be configured to integrate with Okta SSO by providing a public SAML certificate and corresponding private key. The key pair can be generated using an openssl command such as the below. - -```bash -openssl req -new -x509 -days 3652 -nodes -out saml.crt -keyout saml.key -``` diff --git a/website/docs/docs/dbt-cloud/on-premises/setup.md b/website/docs/docs/dbt-cloud/on-premises/setup.md deleted file mode 100644 index 654722192ef..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/setup.md +++ /dev/null @@ -1,127 +0,0 @@ ---- -id: setup -title: Setup ---- - -:::note - -We longer support new on-premises deployments, and instead have moved to a [Single Tenant](single-tenant) model hosted in the cloud - -::: - -## First-time Setup - -### Configuration Console - -dbt Cloud ships with a configuration console that lets you self-manage settings, view the status of your deployment, and automatically install new versions of the software. If you are not sure how to access the configuration console (a.k.a kotsadm UI), go back to the appropriate installation section. You will be prompted for the password created in this section the first time you access the console. - -Note that the version of the configuration console installation can be viewed on the bottom of the UI. - - - -This version corresponds with the kots installation from the previous step and can alternatively be viewed by running the following command. - -```bash -kubectl kots version -``` - -This version should be kept up to date with the latest release which will have the latest patches and bug fixes. 
Detailed instructions on how to upgrade the configuration console will be published soon, but in the meantime, if you need to update the console, [contact your account manager or support](mailto:support@getdbt.com). The kots release notes can be found [here](https://kots.io/release-notes/). - -### Self-signed TLS Certificate Warning (Install into a VM only) - -During the installation process, the application bundle will generate a self-signed certificate for connecting securely to the configuration console. You will need to follow the instructions to temporarily trust this self-signed certificate. This self-signed TLS certificate is only used during the initial setup. - - - -Next, you'll be asked to upload a TLS certificate to secure both the configuration console and the application itself. If you have not already generated this certificate, you should do so now, otherwise your users will see a warning in their browser every time they try to access the application. Note that this hostname must be a DNS name, and not an IP address, for routing to work properly. In addition, you will need to create a DNS record from your desired hostname to this Linux instance to route traffic via DNS. - - - -Enter the desired hostname, and upload your TLS certificate to establish secure connections to this host. Then, press "Upload & Continue" to continue with setup. - -### Upload License - -The first time you log into the configuration console, you will need to upload your license file. This contains information about your subscription, as well as configuration for your specific installation. If you don't already have a license file, [contact your account manager or support](mailto:support@getdbt.com). - - - -After you upload the license, you will be redirected to the Config page. You can access this page at any time to reconfigure your dbt Cloud installation.
- -### Configure the Application - -On the Config screen, you'll be prompted to provide configuration details for your dbt Cloud application. Follow the instructions on this page to configure the application. Most of the configuration values should be established from the [prerequisites section](/docs/dbt-cloud/on-premises/prerequisites). If anything is missing, please contact sales or the person on your team that set up the prerequisites. - - - -### Deploy the Application - -After configuring the application, you will be taken to the **Version History** page where you can manage which version of dbt Cloud is deployed. A series of preflight checks will run on your newly configured version. If everything is configured correctly, it will say "Ready to Deploy." Click Deploy to start the application. - - - -You can skip to the Deploying Application Updates section below to learn more about how dbt Cloud deployment management works. - -### Create a dbt Cloud Account - -Within your dbt Cloud installation, you can create multiple Accounts that can have different user groups assigned to them. To create your first account, navigate to the dbt Cloud backend at `https:///backend`. This is an administrative site you can use to manage the dbt Cloud application. You can login with username "superuser" and the Django Superuser Password you entered in the configuration console. - -After logging in, you can create a new account and invite members of your team, by doing the following: - -- Under Database > Accounts, click + Add -- Enter a name for the account -- Under Concurrency, click Show. Enter how many concurrent jobs to allow for this account. If you aren't sure, pick 1 for now. -- Add users to the account (this is important -- if you don't add users, nobody will be able to access the account!) -- If using Enterprise SSO, follow the integration-specific guide for setting up this account. 
-- If using username / email login, you can invite users to this account by email from this page. Under Invites, enter the email address(es) of users you'd like to invite. When you save the account, these users will receive an invitation in their inbox. -- Save the new account. - -### Account / User Concepts - -Accounts and Users are separate constructs that can have a many-to-many relationship. When creating a new Account, you can add either existing or new users to the account. If one user has access to multiple accounts, they will be able to switch accounts from the dbt Cloud frontend upon login. - -Each user can have a specific role on each account. For more information on each role, please see the docs on [managing permissions](/docs/collaborate/manage-access/about-access) - -### Deploying Application Updates - -A new version of dbt Cloud will appear on the Version History page in your Configuration Console anytime any of the following happen: - -- A new version of the dbt Cloud code is released. This typically happens every two weeks, and each new version will be accompanied by a [changelog](/docs/dbt-versions/dbt-cloud-release-notes). -- Any configuration change is applied to your application via the Configuration Console. -- Anytime an edit is applied to your Kubernetes configs via the overlays mechanism built into kots. - - - -You can apply or roll back these changes at any time by clicking the Deploy and Rollback buttons on the right side of this screen. - - - -### Github Setup - -This setup is only required for Github App usage. Please reference the [Prerequisites](dbt-cloud/on-premises/prerequisites#github) for necessary information. - -1. Log into the dbt Cloud configuration console, navigate to Settings, and check Enable under Github Integration. - -2. Enter the following: - -- Base URL: the base URL of your Github enterprise installation, e.g. github.mycompany.com -- API URL: the scheme-included API URL of your Github enterprise installation. 
Usually https://github.mycompany.com/api/v3 -- App ID: copy from the "About" page of the newly created Github app -- Client ID: copy from the "About" page of the newly created Github app -- Client Secret: copy from the "About" page of the newly created Github app -- Configuration URL for your Github app: right click "Install app" on the "About" page, click "Copy Link Location," and paste in the value here. -- Install URL: right click "Public page" on the "About" page, click "Copy Link Location," and paste in the value here. -- Private Key PEM: on the "About" page of the newly created Github app, scroll down to the bottom of the page. Under "Private Keys," click "Generate a Private Key." Download the key, then open up the file and paste the contents into this field. - -3. Save your dbt Cloud configuration. -4. Restart the "app" pod to update the configuration values. -##### To do this on kubernetes 1.15+: -``` -kubectl rollout restart deployment app -``` -##### To do this on kubernetes <1.15: -``` -kubectl patch deployment app -p \ - "{\"spec\":{\"template\":{\"metadata\":{\"annotations\":{\"date\":\"`date +'%s'`\"}}}}}" -``` - -After these steps, you are ready to manage your Github integration. Your users can log into dbt Cloud, and navigate to Profile > Integrations to start connecting your GitHub account to dbt Cloud. See [GitHub](/docs/collaborate/git/connect-github) for more details on how your users can start using the integration. 
diff --git a/website/docs/docs/dbt-cloud/on-premises/system-requirements.md b/website/docs/docs/dbt-cloud/on-premises/system-requirements.md deleted file mode 100644 index aa63c064be1..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/system-requirements.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -id: system-requirements -title: System Requirements ---- - -:::note - -We no longer support new on-premises deployments, and instead have moved to a [Single Tenant](single-tenant) model hosted in the cloud - -::: - -A dbt Cloud installation requires at least: - -- 4 CPU cores -- 16GB of memory -- 100GB of available disk storage for logs and metadata - -For every 20 developer users, it is recommended to add another 4 CPU cores, 16 GB of memory, and 100GB of disk. You can accomplish this either by creating a larger instance, or by adding another node to your dbt Cloud cluster. - -### Install into an existing Kubernetes cluster - -As mentioned above, it is recommended that at least 4 CPU cores, 16GB of memory, and 100GB of storage is made available for the dbt Cloud application. These are the resource totals consumed across the entire Kubernetes cluster by the dbt Cloud application. - -### Install into a VM - -#### Supported Operating Systems - -Our single-line installer requires a Linux VM. It is recommended that you use **Ubuntu 18.04** as the base operating system. Other supported base operating systems include: - -- Ubuntu 16.04 -- RHEL 7.4, 7.5, 7.6, 7.7 -- CentOS 7.4, 7.5, 7.6, 7.7 - -#### Disk - -The disk you provision should be at least 100GB. `ext4` is the recommended filesystem. Other filesystems **except `xfs`** are supported[[1](https://github.com/rook/rook/blob/master/Documentation/ceph-common-issues.md#a-worker-node-using-rbd-devices-hangs-up)]. - -#### Cloud Providers - -For major cloud providers, use the following instance types or larger for a starter deployment.
- -- **GCP**: n1-standard-4 -- **AWS**: m5.xlarge -- **Azure**: D4 v3 diff --git a/website/docs/docs/dbt-cloud/on-premises/upgrading-kots.md b/website/docs/docs/dbt-cloud/on-premises/upgrading-kots.md deleted file mode 100644 index c0b26d5ba33..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/upgrading-kots.md +++ /dev/null @@ -1,109 +0,0 @@ ---- -id: upgrading-kots -title: Upgrading KOTS ---- - -:::note - -We longer support new on-premises deployments, and instead have moved to a [Single Tenant](single-tenant) model hosted in the cloud - -::: - -## Overview - -As explained in the [dependencies](/docs/dbt-cloud/on-premises/dependencies) section, customer managed deployments rely on the Kubernetes-Off-The-Shelf (KOTS) appliance to configure and deploy the dbt Cloud application. When installing the dbt Cloud application for the first time as described in the [installation](/docs/dbt-cloud/on-premises/installation) section, the latest version of KOTS will be downloaded and installed along with the Admin Console. The KOTS version should be periodically upgraded to stay current with the [latest release](https://kots.io/release-notes/) - -To see the current installed version of KOTS the following command can be run: - -```bash -kubectl kots version -``` - -The KOTS version is also visible at the bottom of the Admin Console UI next to "Terms" and "Privacy". - - - -For newer versions of KOTS, the following command may be run to upgrade your installation to the latest stable version in place: - -```bash -kubectl kots admin-console upgrade -n -``` - -If you are upgrading your KOTS installation and have not tried the previous command, try this first before proceeding to the following section as manually upgrading may not be necessary. If the current installed version of KOTS is too old the upgrade command may fail and then the following instructions should be followed to manually upgrade the KOTS installation. 
- -## Manually upgrading the KOTS installation - -This section describes the steps needed to manually upgrade KOTS by deleting the current installation and re-installing the latest version. This process necessitates updating dbt Cloud to the latest stable version and, as such, it is recommended that this process be executed along with a planned dbt Cloud upgrade. This process will not cause any downtime of the dbt Cloud application. - -Note: This process will erase the Version History in the Admin Console and force you to update to the latest stable version (meaning that you will lose the ability to roll back to prior versions). If you need to deploy a previous version of dbt Cloud please reach out to the dbt Labs team. - -#### Prerequisites - -The following prerequisites are required prior to starting the manual upgrade process. - -- **Latest KOTS Plugin**: If you are deploying into an existing Kubernetes cluster, make sure the latest kots plugin is installed locally by running the following command. - -```bash -curl https://kots.io/install | bash -``` - -If deploying into a VM, make sure the VM has the latest version installed by running the same command. - -- **Backup configuration**: This process will generate a backup of your configuration, but it is _always_ a good idea to create a manual backup of your dbt Cloud configuration (especially the encryption settings and database settings, which can cause dbt Cloud to break if lost or not entered correctly). - -- **License file**: You will need to re-upload your license file when re-installing. If you do not have your license file a new one can be generated by reaching out to the dbt Labs team. - -#### Download the existing KOTS configuration - -The following command will download the current application manifests from the Kubernetes cluster, including config values entered into the Admin Console.
- -```bash -kubectl kots download dbt-cloud-v1 --namespace --dest ./dist -``` - -In older versions of KOTS, the config values will be stored at ```./dist/upstream/userdata/config.yaml``` while newer versions store the config values at ```./dist/dbt-cloud-v1/upstream/userdata/config.yaml```. - -This file will be used later when re-installing KOTS to restore the previous configuration. For more information about the kots download see [here](https://kots.io/kots-cli/download/). - -Note: In newer versions of KOTS the ```--decrypt-password-values``` flag can be set to decrypt password values. If this flag is not set the password values in the downloaded config file will be encrypted. As such, all secret values should be manually backed up as described in the prerequisites section. - -#### Delete all 'kots' and 'kotsadm' Kubernetes objects - -Deleting all of the 'kots' and 'kotsadm' Kubernetes objects will allow us to perform a clean slate install of the KOTS plugin. The following script will delete these objects: - -```bash -kubectl delete deployment kotsadm kotsadm-api kotsadm-operator -kubectl delete statefulset kotsadm-minio kotsadm-postgres -kubectl delete pvc kotsadm-minio-kotsadm-minio-0 \ - kotsadm-postgres-kotsadm-postgres-0 -kubectl delete secret kotsadm-authstring kotsadm-cluster-token \ - kotsadm-encryption kotsadm-minio kotsadm-password kotsadm-postgres \ - kotsadm-replicated-registry kotsadm-session -kubectl delete configmap kotsadm-application-metadata -``` - -Note: In certain KOTS installations some of the resources in this script may not be present and create error messages in the terminal. This is fine and will not impact the deletion of other resources. - -#### Re-install KOTS and upload the saved configuration - -Once all the 'kots' and 'kotsadm' Kubernetes objects are deleted we can re-install KOTS with the latest version using the following two commands. 
The first command will make sure the latest kots release is downloaded and the second will re-install KOTS into the desired namespace with the pre-populated config values from the previous step. The second command will also prompt the user to enter a password for the Admin Console. This does not have to be the same password as previously entered for the Admin Console. - -```bash -curl https://kots.io/install | bash -kubectl kots install dbt-cloud-v1 --namespace \ - --config-values "./dist/upstream/userdata/config.yaml" -``` - -Note that the config values path may need to be modified based on the current version of KOTS that is being upgraded from. - -After the KOTS installer finishes, log in to the Admin Console and verify that the config values are populated correctly in the console. Once verified click continue to proceed to the preflight checks. Once the preflight checks run successfully click continue again to deploy the latest stable version of dbt Cloud. - -#### Re-install kURL proxy (VM only) - -If running dbt Cloud in a VM (not recommended) the kURL installer will need to be run to re-install the kURL proxy to serve up the Admin Console. - -Note: Before running this step it is recommended that you reach out to dbt Labs as a slightly different version of this command may be needed depending on your license channel.
- -```bash -curl -sSL https://k8s.kurl.sh/dbt-cloud-v1 | sudo bash -``` diff --git a/website/docs/docs/dbt-cloud/on-premises/usage-statistics.md b/website/docs/docs/dbt-cloud/on-premises/usage-statistics.md deleted file mode 100644 index aa7759aa4b1..00000000000 --- a/website/docs/docs/dbt-cloud/on-premises/usage-statistics.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -id: usage-statistics -title: Usage Statistics ---- - -:::note - -We longer support new on-premises deployments, and instead have moved to a [Single Tenant](single-tenant) model hosted in the cloud - -::: - -## Overview - -On-premises dbt Cloud deployments send high-level dbt Cloud metadata -for an installation back to dbt Labs. dbt Labs uses this -information to record license utilization and better assist in supporting -on-premises deployments of dbt Cloud. - -Usage statistics are tracked once weekly, and include the following information: - - The account ids and account names present in a deployment - - The number of developer and read only licenses utilized in each account - - The version of dbt Cloud installed in the on-premises environment - -This information is sent as a payload to usage.getdbt.com. A typical -payload looks like: - -```json -{ - "accounts": [ - "id": 1, - "name": "dbt Labs", - "develolper_licenses": 20, - "read_only_licenses": 50, - "dbt_cloud_version": "1.0.0" - ] -} -``` - -## Allow outbound traffic - -To enable the collection of this metadata for your deployment, please allow -outbound http traffic to https://usage.getdbt.com from your dbt Cloud installation. 
diff --git a/website/docs/docs/dbt-cloud/using-dbt-cloud/cloud-model-timing-tab.md b/website/docs/docs/dbt-cloud/using-dbt-cloud/cloud-model-timing-tab.md deleted file mode 100644 index 8f09d9dfb34..00000000000 --- a/website/docs/docs/dbt-cloud/using-dbt-cloud/cloud-model-timing-tab.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -title: "Model timing tab" -id: "cloud-model-timing-tab" -description: "Visually explore runs and identify long-running models with the model timing dashboard." ---- - -### Overview - -:::info Model timing -Model Timing is only available on the Team and Multi-tenant Enterprise plans -::: - -Accessed via the "run detail" page in dbt Cloud, the model timing dashboard displays the model composition, order, and run time for every job run in dbt Cloud. The top 1% of model durations are automatically highlighted for quick reference. This visualization is displayed after the run completes. - -This is a very visual way to explore your run and surface model bottlenecks. Longest running models *may* be ripe for further exploration -- which can lead to refactoring or reducing run cadence. - -Notes: -- The model timing dashboard is currently only available on multi-tenant Team and Enterprise accounts. -- The model timing dashboard can only be viewed for jobs that have successfully completed. - - diff --git a/website/docs/docs/dbt-support.md b/website/docs/docs/dbt-support.md index a24b00ffb90..513d5fff588 100644 --- a/website/docs/docs/dbt-support.md +++ b/website/docs/docs/dbt-support.md @@ -1,43 +1,49 @@ --- title: "dbt support" id: "dbt-support" +pagination_next: null +pagination_prev: null --- ## dbt Core support -If you're developing in the command line (CLI) and have questions or need some help — reach out to the helpful dbt community through [the Community Forum](https://discourse.getdbt.com/) or [dbt Community slack](https://www.getdbt.com/community/join-the-community/). 
+If you're developing on the command line (CLI) and have questions or need some help — reach out to the helpful dbt community through [the Community Forum](https://discourse.getdbt.com/) or [dbt Community slack](https://www.getdbt.com/community/join-the-community/). ## dbt Cloud support -We want to help you work through implementing and utilizing dbt Cloud at your organization. Have a question you can't find an answer to in [our docs](https://docs.getdbt.com/) or [the Community Forum](https://discourse.getdbt.com/)? dbt Support is here to `dbt help` you! -Check out our guide on [getting help](/guides/legacy/getting-help) - half of the problem is often knowing where to look... and how to ask good questions! -Types of questions dbt Support will assist you with: +We want to help you work through implementing and utilizing dbt Cloud at your organization. Have a question you can't find an answer to in [our docs](https://docs.getdbt.com/) or [the Community Forum](https://discourse.getdbt.com/)? Our Support team is here to `dbt help` you! +Check out our guide on [getting help](/community/resources/getting-help) - half of the problem is often knowing where to look... and how to ask good questions! + +Types of dbt Cloud-related questions our Support team can assist you with, regardless of your dbt Cloud plan: - **How do I...** - set up a dbt Cloud project? - set up a private package in dbt Cloud? - configure custom branches on git repos? - link dbt to a new github account? - **Help! I can't...** - - log in! - - access logs! - - update user groups! + - log in. + - access logs. + - update user groups. +- **I need help understanding...** + - why this run failed. + - why I am getting this error message in dbt Cloud. + - why my CI jobs are not kicking off as expected. + + +### dbt Cloud Enterprise accounts + +For customers on a dbt Cloud Enterprise plan, we **also** offer basic assistance in troubleshooting issues with your dbt project. 
- **Something isn't working the way I would expect it to...** - in a macro I created... - in an incremental model I'm building... - in one of dbt Labs' packages like dbt_utils or audit_helper... - **I need help understanding and troubleshooting this error...** - - `Server error: Compilation Error in rpc request (from remote system) 'dbt_utils' is undefined` - - `SQL compilation error: syntax error line 1 at position 38 unexpected ''.` - - `Compilation Error Error reading name_of_folder/name_of_file.yml - Runtime Error Syntax error near line 9` - -### dbt Cloud Enterprise accounts - Types of questions you should ask your Solutions Architect and Sales Director: - How should we think about setting up our dbt projects, environments, and jobs based on our company structure and needs? - I want to expand my account! How do I add more people and train them? @@ -45,4 +51,4 @@ Types of questions you should ask your Solutions Architect and Sales Director: - It is time for our contract renewal, what options do I have? -When you need help writing SQL or want someone to actually help build your dbt Project, check out our list of [dbt Preferred Consulting Providers](https://www.getdbt.com/ecosystem/) or our [Services](https://www.fishtownanalytics.com/professional-services/) page! +When you need help writing SQL, reviewing the overall performance of your project, or want someone to actually help build your dbt project, check out our list of [dbt Preferred Consulting Providers](https://www.getdbt.com/ecosystem/) or our [Services](https://www.getdbt.com/dbt-labs/services/) page! 
diff --git a/website/docs/docs/dbt-versions/core-versions.md b/website/docs/docs/dbt-versions/core-versions.md index 3ba6bb75375..5e8e437f0b1 100644 --- a/website/docs/docs/dbt-versions/core-versions.md +++ b/website/docs/docs/dbt-versions/core-versions.md @@ -2,35 +2,51 @@ title: "About dbt Core versions" id: "core" description: "Learn about semantic versioning for dbt Core, and how long those versions are supported." +pagination_next: "docs/dbt-versions/upgrade-core-in-cloud" +pagination_prev: null --- -dbt Core releases follow [semantic versioning](https://semver.org/). The policies and expectations on this page assume prior familiarity with semantic versions. For more on how we use semantic versions, see [How dbt Core uses semantic versioning](#how-dbt-core-uses-semantic-versioning). +dbt Core releases follow [semantic versioning](https://semver.org/) guidelines. For more on how we use semantic versions, see [How dbt Core uses semantic versioning](#how-dbt-core-uses-semantic-versioning). + +dbt Labs provides different support levels for different versions, which may include new features, bug fixes, or security patches: + + + + + ### Further reading - To learn how you can use dbt Core versions in dbt Cloud, see [Choosing a dbt Core version](/docs/dbt-versions/upgrade-core-in-cloud). -- To learn about installing dbt Core, see "[How to install dbt Core](/docs/get-started/installation)." -- To restrict your project to only work with a range of dbt Core versions, or use the currently running dbt Core version, see [`require-dbt-version`](require-dbt-version) and [`dbt_version`](dbt_version). +- To learn about installing dbt Core, see "[How to install dbt Core](/docs/core/installation)." +- To restrict your project to only work with a range of dbt Core versions, or use the currently running dbt Core version, see [`require-dbt-version`](/reference/project-configs/require-dbt-version) and [`dbt_version`](/reference/dbt-jinja-functions/dbt_version). 
## Version support prior to v1.0 -- We are no longer releasing new patches for minor versions prior to v1.0. -- As of June 30, 2022, dbt Cloud will remove support for dbt Core versions older than v1.0. At that point, we will also remove v0.20 and v0.21 from the version dropdown on this website. -- You can read the [specific version migration guides](/guides/migration/versions) to understand changes to each version. Each migration guide will link to pages of documentation that were added or updated. Those pages of documentation will also include "Changelog" notes, which you can toggle to see notes on specific changes from each older version. +All dbt Core versions released prior to 1.0 and their version-specific documentation have been deprecated. If upgrading to a currently supported version, reference our [best practices for upgrading](#best-practices-for-upgrading) + +## EOL version support + +All dbt Core minor versions that have reached end-of-life (EOL) will have no new patch releases. This means they will no longer receive any fixes, including for known bugs that have been identified. Fixes for those bugs will instead be made in newer minor versions that are still under active support. + +We recommend upgrading to a newer version in [dbt Cloud](/docs/dbt-versions/upgrade-core-in-cloud) or [dbt Core](/docs/core/installation#upgrading-dbt-core) to continue receiving support. + +All dbt Core v1.0 and later are available in dbt Cloud until further notice. In the future, we intend to align dbt Cloud availability with dbt Core ongoing support. You will receive plenty of advance notice before any changes take place. + -## Version support starting with v1.0 +## Current version support -### Minor version support +### Minor versions -Minor versions include new features and capabilities. They will be supported for one year (12 months) from the date of their initial release. 
_This is a definite commitment._ Our mechanism for continuing to support a minor version is by releasing new patches: small, targeted bug fixes. Whenever we refer to a minor version, such as v1.0, we always mean its latest available patch release (v1.0.x). +Minor versions include new features and capabilities. They will be supported for one year from their initial release date. _dbt Labs is committed to this 12-month support timeframe._ Our mechanism for continuing to support a minor version is by releasing new patches: small, targeted bug fixes. Whenever we refer to a minor version, such as v1.0, we always mean its latest available patch release (v1.0.x). While a minor version is officially supported: -- You can use it in dbt Cloud. For more on dbt Cloud versioning, see [Choosing a dbt version](cloud-choosing-a-dbt-version). +- You can use it in dbt Cloud. For more on dbt Cloud versioning, see [Choosing a dbt version](/docs/dbt-versions/upgrade-core-in-cloud). - You can select it from the version dropdown on this website, to see documentation that is accurate for use with that minor version. ### Ongoing patches -During the 12 months of ongoing support, we will continue to release new patch versions that include fixes. +During the 12-month support window, we will continue to release new patch versions that include fixes. **Active Support:** In the first few months after a minor version's initial release, we will patch it with "bugfix" releases. These will include fixes for regressions and net-new bugs that were present in the minor version's original release. @@ -42,12 +58,6 @@ After a minor version reaches the end of its critical support period, one year a We aim to release a new minor "feature" every 3 months. 
_This is an indicative timeline ONLY._ For the latest information about upcoming releases, including their planned release dates and which features and fixes might be included in each, always consult the [`dbt-core` repository milestones](https://github.com/dbt-labs/dbt-core/milestones). - - -:::warning ⚠️ v0.X Non-Supported Period -We are giving accounts until the end of June 2022 to upgrade to dbt 1.0 or later. Pre-dbt 1.0 versions will not receive patch fixes, and our support team will no longer assist with dbt version specific help on non-supported versions of dbt. Additionally, jobs running dbt versions prior to 1.0 will start experiencing service disruptions before the end of the year 2022 and will be removed from the dbt Cloud context by end of the year 2022. You will receive additional email and in app notification before disruption to your production jobs. -::: - ## Best practices for upgrading Because of our new version practice, we've outlined best practices and expectations for dbt users to upgrade as we continue to release new versions of dbt Core. @@ -58,7 +68,7 @@ We expect users to upgrade to patches as soon as they're available. When we refe ### Upgrading to new minor versions -You may continue to use any minor version of dbt while it is officially supported. During that period, it will remain available in dbt Cloud, and in the version dropdown on this website. While we do not expect users to immediately upgrade to newer minor versions as soon as they're available, there will always be some features and fixes that are only available for users of the latest minor version. +During the official support period, minor versions will remain available in dbt Cloud and the version dropdown on the docs site. While we do not expect users to immediately upgrade to newer minor versions as soon as they're available, there will always be some features and fixes only available for users of the latest minor version. 
### Trying prereleases @@ -68,31 +78,37 @@ All dbt Core versions are available as _prereleases_ before the final release. " Like many software projects, dbt Core releases follow [semantic versioning](https://semver.org/), which defines three types of version releases. -- **Major versions:** To date, dbt Core has had one major version release: v1.0.0. When v2.0.0 is released, it will introduce new features and break backwards compatibility for functionality that has been deprecated. +- **Major versions:** To date, dbt Core has had one major version release: v1.0.0. When v2.0.0 is released, it will introduce new features, and functionality that has been announced for deprecation will stop working. - **Minor versions**, also called "feature" releases, include a mix of new features, behind-the-scenes improvements, and changes to existing capabilities that are **backwards compatible** with previous minor versions. They will not break code in your project that relies on documented functionality. - **Patch versions**, also called "bugfix" or "security" releases, include **fixes _only_**. These fixes could be needed to restore previous (documented) behavior, fix obvious shortcomings of new features, or offer critical fixes for security or installation issues. We are judicious about which fixes are included in patch releases, to minimize the surface area of changes. We are committed to avoiding breaking changes in minor versions for end users of dbt. There are two types of breaking changes that may be included in minor versions: -- Changes to the [Python interface for adapter plugins](/guides/advanced/adapter-development/3-building-a-new-adapter). These changes are relevant _only_ to adapter maintainers, and they will be clearly communicated in documentation and release notes. -- Changes to metadata interfaces, including [artifacts](dbt-artifacts) and [logging](events-logging), signalled by a version bump. 
Those version upgrades may require you to update external code that depends on these interfaces, or to coordinate upgrades between dbt orchestrations that share metadata, such as [state-powered selection](understanding-state). + +- Changes to the [Python interface for adapter plugins](/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter). These changes are relevant _only_ to adapter maintainers, and they will be clearly communicated in documentation and release notes. +- Changes to metadata interfaces, including [artifacts](/docs/deploy/artifacts) and [logging](/reference/events-logging), signalled by a version bump. Those version upgrades may require you to update external code that depends on these interfaces, or to coordinate upgrades between dbt orchestrations that share metadata, such as [state-powered selection](/reference/node-selection/syntax#about-node-selection). ### How we version adapter plugins -When you use dbt, you're using `dbt-core` together with an adapter plugin specific to your database. You can see the current list in [Supported Data Platforms](supported-data-platforms). Both `dbt-core` and dbt adapter plugins follow semantic versioning. +When you use dbt, you use a combination of `dbt-core` and an adapter plugin specific to your database. You can see the current list in [Supported Data Platforms](/docs/supported-data-platforms). Both `dbt-core` and dbt adapter plugins follow semantic versioning. -`dbt-core` and adapter plugins coordinate new features and behind-the-scenes changes in minor releases. When it comes to fixing bugs, sooner is better—so patch versions are released independently for `dbt-core` and plugins. +`dbt-core` and adapter plugins coordinate new features and behind-the-scenes changes in minor releases. When fixing bugs, sooner is better, so patch versions are released independently for `dbt-core` and plugins. -What does that mean? 
Patch version numbers are likely to be different between `dbt-core` and the adapter plugin(s) you have installed. Major and minor version numbers should always match. +That means that patch version numbers will likely differ between `dbt-core` and the adapter plugin(s) you have installed. However, major and minor version numbers should always match. -As an example, on March 1, you may find you're using `dbt-core==1.0.3` with `dbt-snowflake==1.0.0`. The most important thing is that you're using the latest patch available for each (v1.0.x). If you're running dbt locally, you can use the `dbt --version` command to see which versions you have installed: +For example, you may find you're using `dbt-core==1.6.0` with `dbt-snowflake==1.6.0`. It is critical that you're using the latest patch available for both core and the adapter. Use the `dbt --version` command to see which versions you have installed: ``` $ dbt --version -installed version: 1.0.3 - latest version: 1.0.3 - -Up to date! +Core: + - installed: 1.6.0 + - latest: 1.6.0 - Up to date! Plugins: - - snowflake: 1.0.0 - Up to date! + - snowflake: 1.6.0 - Up to date! ``` + +You can see which version of the registered adapter that's being invoked in the [logs](/reference/global-configs/logs). Below is an example of the message in the `logs/dbt.log` file: +``` +[0m13:13:48.572182 [info ] [MainThread]: Registered adapter: snowflake=1.6.0 +``` + It's likely that newer patches have become available since then, so it's always important to check and make sure you're up to date! 
diff --git a/website/docs/docs/dbt-versions/experimental-features.md b/website/docs/docs/dbt-versions/experimental-features.md new file mode 100644 index 00000000000..a621bd4ac44 --- /dev/null +++ b/website/docs/docs/dbt-versions/experimental-features.md @@ -0,0 +1,28 @@ +--- +title: "Preview new and experimental features in dbt Cloud" +id: "experimental-features" +sidebar_label: "Preview new dbt Cloud features" +description: "Gain early access to many new dbt Labs experimental features by enabling this in your profile." +pagination_next: null +--- + +dbt Labs often tests experimental features before deciding to continue on the [Product lifecycle](https://docs.getdbt.com/docs/dbt-versions/product-lifecycles#dbt-cloud). + +You can access experimental features to preview beta features that haven’t yet been released to dbt Cloud. You can toggle on or off all experimental features in your Profile settings. Experimental features: + +- May not be feature-complete or fully stable as we’re actively developing them. +- Could be discontinued at any time. +- May require feedback from you to understand their limitations or impact. Each experimental feature collects feedback directly in dbt Cloud, which may impact dbt Labs' decisions to implement. +- May have limited technical support and be excluded from our Support SLAs. +- May not have public documentation available. + +To enable or disable experimental features: + +1. Navigate to **Profile settings** by clicking the gear icon in the top right. +2. Find Experimental features at the bottom of Your Profile page. +3. Click **Beta** to toggle the features on or off as shown in the following image. + ![Experimental features](/img/docs/dbt-versions/experimental-feats.png) + +## Beta terms and conditions + +By using or enabling features that are not yet in general release ("Beta Features"), you agree to the [Beta Features Terms and Conditions](/assets/beta-tc.pdf). 
diff --git a/website/docs/docs/dbt-versions/product-lifecycles.md b/website/docs/docs/dbt-versions/product-lifecycles.md index f676c6af2eb..71f33110eb2 100644 --- a/website/docs/docs/dbt-versions/product-lifecycles.md +++ b/website/docs/docs/dbt-versions/product-lifecycles.md @@ -14,7 +14,7 @@ Any dbt feature will fall into one of the following lifecycle states: ### dbt Cloud -- **Beta:** Beta features may be made available for the purpose of customer testing and evaluation. These may not be feature-complete or fully stable. There may still be some planned additions and modifications to product behaviors while in Beta. Breaking changes may occur – although we will do our best to communicate them in advance, we may not always be able to do so. Beta features may not be fully documented, technical support may be limited, and service level objectives (SLOs) may not be provided. +- **Beta:** Beta features may be made available for the purpose of customer testing and evaluation. These might not be feature-complete or fully stable. There might still be some planned additions and modifications to product behaviors while in beta. Breaking changes could occur — although we will do our best to communicate them in advance, we might not always be able to do so. Beta features might not be fully documented, technical support might be limited, and service level objectives (SLOs) might not be provided. Download the [Beta Features Terms and Conditions](/assets/beta-tc.pdf) for more details. - **Preview (Private or Public):** Preview features are stable and can be considered for production deployments. There may still be some planned additions and modifications to product behaviors before moving to General Availability. We may also introduce new functionality to Preview features that is not backward compatible. Preview features include documentation, technical support, and include service level objectives (SLOs). 
Features in Preview are generally provided at no extra cost, although they may become paid features in their Generally Available state. diff --git a/website/docs/docs/dbt-versions/release-notes.md b/website/docs/docs/dbt-versions/release-notes.md index db25af163ae..6f7be90e60d 100644 --- a/website/docs/docs/dbt-versions/release-notes.md +++ b/website/docs/docs/dbt-versions/release-notes.md @@ -2,6 +2,8 @@ title: "About dbt Cloud Release Notes" id: "dbt-cloud-release-notes" description: "Release notes for dbt Cloud" +pagination_next: null +pagination_prev: null --- dbt provides release notes for dbt Cloud so you can see recent and historical changes. Generally, you'll see release notes for these changes: diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/cloud-cli-pp.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/cloud-cli-pp.md new file mode 100644 index 00000000000..d96b82636f8 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/cloud-cli-pp.md @@ -0,0 +1,31 @@ +--- +title: "New: dbt Cloud CLI in Public Preview" +description: "October 2023: Learn about the new dbt Cloud CLI development experience, now in public preview." +sidebar_position: 04 +sidebar_label: "New: dbt Cloud CLI in Public Preview" +tags: [Oct-2023, CLI, dbt Cloud] +date: 2023-10-17 +--- + +We are excited to announce that the dbt Cloud CLI, a **unified command line for dbt**, is available in public preview. It’s a local development experience, powered by dbt Cloud. It’s easy to get started: `pip3 install dbt` or `brew install dbt` and you’re ready to go. + +We will continue to invest in the dbt Cloud IDE as the easiest and most accessible way to get started using dbt, especially for data analysts who have never developed software using the command line before. We will keep improving the speed, stability, and feature richness of the IDE, as we have been [all year long](https://www.getdbt.com/blog/improvements-to-the-dbt-cloud-ide/). 
+ +We also know that many people developing in dbt have a preference for local development, where they can use their favorite terminal, text editor, keybindings, color scheme, and so on. This includes people with data engineering backgrounds, as well as those analytics engineers who started writing code in the dbt Cloud IDE and have expanded their skills. + +The new dbt Cloud CLI offers the best of both worlds, including: + +- The power of developing against the dbt Cloud platform +- The flexibility of your own local setup + +Run whichever community-developed plugins, pre-commit hooks, or other arbitrary scripts you like. + +Some of the unique capabilities of this dbt Cloud CLI include: + +- Automatic deferral of build artifacts to your Cloud project's production environment +- Secure credential storage in the dbt Cloud platform +- Support for dbt Mesh ([cross-project `ref`](/docs/collaborate/govern/project-dependencies)) +- Development workflow for dbt Semantic Layer +- Speedier, lower cost builds + +Refer to [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) to learn more. diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/custom-branch-fix-rn.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/custom-branch-fix-rn.md new file mode 100644 index 00000000000..06550b7d863 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/custom-branch-fix-rn.md @@ -0,0 +1,14 @@ +--- +title: "Fix: Default behavior for CI job runs without a custom branch" +description: "October 2023: CI job runs now default to the main branch of the Git repository when a custom branch isn't set" +sidebar_label: "Fix: Default behavior for CI job runs without a custom branch" +tags: [Oct-2023, CI] +date: 2023-10-06 +sidebar_position: 08 +--- + +If you don't set a [custom branch](/docs/dbt-cloud-environments#custom-branch-behavior) for your dbt Cloud environment, it now defaults to the default branch of your Git repository (for example, `main`). 
Previously, [CI jobs](/docs/deploy/ci-jobs) would run for pull requests (PRs) that were opened against _any branch_ or updated with new commits if the **Custom Branch** option wasn't set. + +## Azure DevOps + +Your Git pull requests (PRs) might not trigger against your default branch if you're using Azure DevOps and the default branch isn't `main` or `master`. To resolve this, [set up a custom branch](/faqs/Environments/custom-branch-settings) with the branch you want to target. diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/dbt-deps-auto-install.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/dbt-deps-auto-install.md new file mode 100644 index 00000000000..80963a9d550 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/dbt-deps-auto-install.md @@ -0,0 +1,21 @@ +--- +title: "Enhancement: dbt Cloud auto-installs 'dbt deps' on startup" +description: "October 2023: The dbt Cloud IDE and dbt Cloud CLI auto-handle 'dbt deps' on startup; manual run needed for 'packages.yml' changes. Available for multi-tenant users (single-tenant support coming soon) and applies to all dbt versions." +sidebar_label: "Enhancement: dbt Cloud auto-installs 'dbt deps' on startup" +tags: [Oct-2023, IDE] +date: 2023-10-17 +sidebar_position: 06 +--- + +The dbt Cloud IDE and dbt Cloud CLI now automatically run `dbt deps` when your environment starts or when necessary. Previously, they would prompt you to run `dbt deps` during initialization. + +This improved workflow is available to all multi-tenant dbt Cloud users (single-tenant support coming next week) and applies to all dbt versions. + +However, you should still run the `dbt deps` command in these situations: + +- When you make changes to the `packages.yml` or `dependencies.yml` file during a session +- When you update the package version in the `packages.yml` or `dependencies.yml` file. 
+- If you edit the `dependencies.yml` file and the number of packages remains the same, run `dbt deps`. (Note that this is a known bug dbt Labs will fix in the future.) + + + diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/explorer-public-preview-rn.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/explorer-public-preview-rn.md new file mode 100644 index 00000000000..ebf5add8d03 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/explorer-public-preview-rn.md @@ -0,0 +1,13 @@ +--- +title: "New: dbt Explorer Public Preview" +description: "October 2023: dbt Explorer is now available in Public Preview. You can use it to understand, improve, and leverage your dbt projects." +sidebar_label: "New: dbt Explorer Public Preview" +tags: [Oct-2023, Explorer] +date: 2023-10-13 +sidebar_position: 07 +--- + +On Oct 17, 2023, a Public Preview of dbt Explorer will become available to dbt Cloud customers. With dbt Explorer, you can view your project's resources (such as models, tests, and metrics) and their lineage — including interactive DAGs — to gain a better understanding of its latest production state. Navigate and manage your projects within dbt Cloud to help you and other data developers, analysts, and consumers discover and leverage your dbt resources. + +For details, refer to [Explore your dbt projects](/docs/collaborate/explore-projects). 
+ diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/native-retry-support-rn.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/native-retry-support-rn.md new file mode 100644 index 00000000000..20e56879940 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/native-retry-support-rn.md @@ -0,0 +1,15 @@ +--- +title: "Enhancement: Native support for the dbt retry command" +description: "October 2023: Rerun errored jobs from start or from the failure point" +sidebar_label: "Enhancement: Support for dbt retry" +tags: [Oct-2023, Scheduler] +date: 2023-10-06 +sidebar_position: 10 +--- + +Previously in dbt Cloud, you could only rerun an errored job from start, but now you can also rerun it from its point of failure. + +You can view which job failed to complete successfully, which command failed in the run step, and choose how to rerun it. To learn more, refer to [Retry jobs](/docs/deploy/retry-jobs). + + + diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/product-docs-sept-rn.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/product-docs-sept-rn.md new file mode 100644 index 00000000000..e669b037d17 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/product-docs-sept-rn.md @@ -0,0 +1,38 @@ +--- +title: "September 2023 product docs updates" +id: "product-docs-sept" +description: "September 2023: The Product docs team merged 107 PRs, made various updates to dbt Cloud and Core, such as GAing continuous integration jobs, Semantic Layer GraphQL API doc, a new community plugin, and more" +sidebar_label: "Update: Product docs changes" +tags: [Sept-2023, product-docs] +date: 2023-10-10 +sidebar_position: 09 +--- + +Hello from the dbt Docs team: @mirnawong1, @matthewshaver, @nghi-ly, and @runleonarun! First, we’d like to thank the 15 new community contributors to docs.getdbt.com. 
We merged [107 PRs](https://github.com/dbt-labs/docs.getdbt.com/pulls?q=is%3Apr+merged%3A2023-09-01..2023-09-31) in September. + +Here's what's new to [docs.getdbt.com](http://docs.getdbt.com/): + +* Migrated docs.getdbt.com from Netlify to Vercel. + +## ☁ Cloud projects +- Continuous integration jobs are now generally available and no longer in beta! +- Added [Postgres PrivateLink set up page](/docs/cloud/secure/postgres-privatelink) +- Published beta docs for [dbt Explorer](/docs/collaborate/explore-projects). +- Added a new Semantic Layer [GraphQL API doc](/docs/dbt-cloud-apis/sl-graphql) and updated the [integration docs](/docs/use-dbt-semantic-layer/avail-sl-integrations) to include Hex. Responded to dbt community feedback and clarified Metricflow use cases for dbt Core and dbt Cloud. +- Added an [FAQ](/faqs/Git/git-migration) describing how to migrate from one git provider to another in dbt Cloud. +- Clarified an example and added a [troubleshooting section](/docs/cloud/connect-data-platform/connect-snowflake#troubleshooting) to Snowflake connection docs to address common errors and provide solutions. + + +## 🎯 Core projects + +- Deprecated dbt Core v1.0 and v1.1 from the docs. +- Added configuration instructions for the [AWS Glue](/docs/core/connect-data-platform/glue-setup) community plugin. +- Revised the dbt Core quickstart, making it easier to follow. Divided this guide into steps that align with the [other guides](/quickstarts/manual-install?step=1). + +## New 📚 Guides, ✏️ blog posts, and FAQs + +Added a [style guide template](/guides/best-practices/how-we-style/6-how-we-style-conclusion#style-guide-template) that you can copy & paste to make sure you adhere to best practices when styling dbt projects! + +## Upcoming changes + +Stay tuned for a flurry of releases in October and a filterable guides section that will make guides easier to find! 
diff --git a/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/sl-ga.md b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/sl-ga.md new file mode 100644 index 00000000000..5e53363f62a --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/03-Oct-2023/sl-ga.md @@ -0,0 +1,29 @@ +--- +title: "Update: dbt Cloud Semantic Layer is Generally Available" +description: "October 2023: dbt Cloud Semantic Layer is Generally Available for all users" +sidebar_label: "Update: dbt Cloud Semantic Layer is GA" +sidebar_position: 05 +date: 2023-10-17 +tags: [Oct-2023] +--- + +:::important +If you're using the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher and [migrate](/guides/migration/sl-migration) to the latest Semantic Layer. +::: + +dbt Labs is thrilled to announce that the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) is now generally available. It offers consistent data organization, improved governance, reduced costs, enhanced efficiency, and accessible data for better decision-making and collaboration across organizations. + +It aims to bring the best of modeling and semantics to downstream applications by introducing: + +- Brand new [integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) such as Tableau, Google Sheets, Hex, Mode, and Lightdash. +- New [Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) using GraphQL and JDBC to query metrics and build integrations. +- dbt Cloud [multi-tenant regional](/docs/cloud/about-cloud/regions-ip-addresses) support for North America, EMEA, and APAC. Single-tenant support coming soon. +- Use the APIs to call an export (a way to build tables in your data platform), then access them in your preferred BI tool. Starting from dbt v1.7 or higher, you will be able to schedule exports as part of your dbt job. 
+ + + +The dbt Semantic Layer is available to [dbt Cloud Team or Enterprise](https://www.getdbt.com/) multi-tenant plans on dbt v1.6 or higher. +- Team and Enterprise customers can use 1,000 Queried Units per month for no additional cost on a limited trial basis, subject to reasonable use limitations. Refer to [Billing](/docs/cloud/billing#what-counts-as-a-query-unit) for more information. +- dbt Cloud Developer plans and dbt Core users can define metrics but won't be able to query them with integrated tools. + + diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md new file mode 100644 index 00000000000..fd2d163b748 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase2-rn.md @@ -0,0 +1,42 @@ +--- +title: "Update: Improvements to dbt Cloud continuous integration" +description: "September 2023: dbt Cloud now has two types of jobs -- deploy jobs and CI jobs -- with streamlined setup and improved efficiency. " +sidebar_label: "Update: Improvements to dbt jobs" +tags: [Sept-2023, CI] +date: 2023-09-11 +sidebar_position: 10 +--- + +dbt Cloud now has two distinct job types: [deploy jobs](/docs/deploy/deploy-jobs) for building production data assets, and [continuous integration (CI) jobs](/docs/deploy/ci-jobs) for checking code changes. These jobs perform fundamentally different tasks so dbt Labs improved the setup experience with better defaults for each. + +With two types of jobs, instead of one generic type, we can better guide you through the setup flow. Best practices are built into the default settings so you can go from curious to being set up in seconds. + + + +And, we now have more efficient state comparisons on CI checks: never waste a build or test on code that hasn’t been changed. 
We now diff between the Git pull request (PR) code and what’s running in production more efficiently with the introduction of deferral to an environment versus a job. To learn more, refer to [Continuous integration in dbt Cloud](/docs/deploy/continuous-integration). + +Below is a comparison table that describes how deploy jobs and CI jobs behave differently: + +| | Deploy Jobs | CI Jobs | +| --- | --- | --- | +| Purpose | Builds production data assets. | Builds and tests new code before merging changes into production. | +| Trigger types | Triggered by a schedule or by API. | Triggered by a commit to a PR or by API. | +| Destination | Builds into a production database and schema. | Builds into a staging database and ephemeral schema, lived for the lifetime of the PR. | +| Execution mode | Runs execute sequentially, so as to not have collisions on the underlying DAG. | Runs execute in parallel to promote team velocity. | +| Efficiency run savings | Detects over-scheduled jobs and cancels unnecessary runs to avoid queue clog. | Cancels existing runs when a newer commit is pushed to avoid redundant work. | +| State comparison | Only sometimes needs to detect state. | Almost always needs to compare state against the production environment to build on modified code and its dependents. | + + +## What you need to update + +- If you want to set up a CI environment for your jobs, dbt Labs recommends that you create your CI job in a dedicated [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) that's connected to a staging database. To learn more about these environment best practices, refer to the guide [Get started with continuous integration tests](/guides/orchestration/set-up-ci/overview). + +- If you had set up a CI job before October 2, 2023, the job might've been misclassified as a deploy job with this update. 
Below describes how to fix the job type: + + If you used the [Create Job](/dbt-cloud/api-v2#/operations/Create%20Job) API endpoint but didn't set `"triggers":triggers.git_provider_webhook`, the job was misclassified as a deploy job and you must re-create it as described in [Trigger a CI job with the API](/docs/deploy/ci-jobs#trigger-a-ci-job-with-the-api). + + If you used the dbt Cloud UI but didn't enable the **Run on Pull Requests** option that was in the **Continuous Integration** (CI) tab, the job was misclassified as a deploy job and you must re-create it as described in [Set up CI jobs](/docs/deploy/ci-jobs#set-up-ci-jobs). + + To check for the job type, review your CI jobs in dbt Cloud's [Run History](/docs/deploy/run-visibility#run-history) and check for the **CI Job** tag below the job name. If it doesn't have this tag, it was misclassified and you need to re-create the job. + + diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md new file mode 100644 index 00000000000..174de2bdaaf --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/ci-updates-phase3-rn.md @@ -0,0 +1,16 @@ +--- +title: "Update: Improvements to dbt Cloud continuous integration" +description: "September 2023: Improved deletion of temporary schemas" +sidebar_label: "Update: Improved automatic deletion of temporary schemas" +tags: [Sept-2023, CI] +date: 2023-09-18 +sidebar_position: 08 +--- + +Temporary schemas are now being automatically deleted (dropped) for all adapters (like Databricks), PrivateLink connections, and environment variables in connection strings. + +dbt Labs has rearchitected how schema deletion works for [continuous integration (CI)](/docs/deploy/continuous-integration) runs. We created a new service to delete any schema with a prefix of `dbt_cloud_pr_` that's been generated by a PR run. 
+ +However, temporary schemas will not be automatically deleted if: +- Your project overrides the [generate_schema_name macro](/docs/build/custom-schemas) but it doesn't contain the required prefix `dbt_cloud_pr_`. For details, refer to [Troubleshooting](/docs/deploy/ci-jobs#troubleshooting). +- You're using a [non-native Git integration](/docs/deploy/ci-jobs#trigger-a-ci-job-with-the-api). This is because automatic deletion relies on incoming webhooks from Git providers, which is only available through the native integrations. diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md new file mode 100644 index 00000000000..a647bb5f585 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/product-docs-summer-rn.md @@ -0,0 +1,43 @@ +--- +title: "Summer 2023 product docs updates" +id: "product-docs-summer" +description: "Summer 2023: The Product docs team merged 256 PRs, made various updates to dbt Cloud and Core, such as adding What's New, writing Semantic Layer beta docs, releasing dbt 1.6 docs, and more!" +sidebar_label: "Update: Product docs changes" +tags: [July-2023, Aug-2023, product-docs] +date: 2023-09-13 +sidebar_position: 09 +--- + +Hello from dbt's Product Documentation team (the stewards of the docs.getdbt.com site): @mirnawong1, @matthewshaver, @nghi-ly, and @runleonarun. What a busy summer! We merged 256 PRs between July 1st and August 31. + +We'd like to recognize all of the docs and support from our partner team, Developer Experience: @jasnonaz @gwenwindflower @dbeatty10 @dataders @joellabes @Jstein77 @dave-connors-3! + +We'd also like to give a special thanks to the 22 community members who contributed to the [dbt Product docs](https://docs.getdbt.com) for the first time. 
:pray: Based on feedback from the dbt community, we made these changes: + +- Added a [permissions table](/docs/cloud/manage-access/enterprise-permissions) for Enterprise accounts. +- Added a [browser session page](/docs/cloud/about-cloud/browsers#browser-sessions) that clarifies dbt Cloud’s browser session time and when it logs users off. + +You can provide feedback by opening a pull request or issue in [our repo](https://github.com/dbt-labs/docs.getdbt.com) or reaching out in the dbt community Slack channel [#dbt-product-docs](https://getdbt.slack.com/archives/C0441GSRU04). + +## :zap: General docs projects + +* Added the ability to collapse sections you’re not currently looking at. There were quite a few people who wanted this, and it bugged us too, so we were happy to get this shipped! +* Introduced the idea of [“Trusted” adapters](/docs/supported-data-platforms#types-of-adapters). + +## ☁ Cloud projects + +* The **What’s new?** product update widget is back in the dbt Cloud UI! The Docs team will begin updating the content to keep you informed about new features. +* Launched the re-released [Semantic Layer beta docs](/docs/use-dbt-semantic-layer/dbt-sl), which introduce users to the new API, include a new guide to set up MetricFlow and the new Semantic Layer, and revamp the ‘Use the dbt Semantic Layer’ section for users. +* Updated [Admin API v2 and v3](/docs/dbt-cloud-apis/admin-cloud-api) to help you understand the differences between them and which version includes the endpoints you use. +* To improve discoverability, the docs team made changes to the [deploy dbt sidebar](/docs/deploy/deployments). We added cards and aligned better with the dbt Cloud UI and the way it’s used. +* Deprecated legacy job schemas in the [Discovery API](/docs/dbt-cloud-apis/discovery-api). +* Added a page to describe [experimental and beta features](/docs/dbt-versions/experimental-features) in dbt Cloud and what you need to know about them. 
+* Added a section to introduce a new beta feature [**Extended Attributes**](/docs/dbt-cloud-environments#extended-attributes-beta), which allows users to set a flexible `profiles.yml` snippet in their dbt Cloud Environment settings. +## 🎯 Core projects + +* We released [dbt 1.6](/guides/migration/versions/upgrading-to-v1.6)! We added docs for the new commands `dbt retry` and `dbt clone` + +## New 📚 Guides, ✏️ blog posts, and FAQs +* Check out how these community members use the dbt community in the [Community spotlight](/community/spotlight). +* Blog posts published this summer include [Optimizing Materialized Views with dbt](/blog/announcing-materialized-views), [Data Vault 2.0 with dbt Cloud](/blog/data-vault-with-dbt-cloud), and [Create dbt Documentation and Tests 10x faster with ChatGPT](/blog/create-dbt-documentation-10x-faster-with-chatgpt) +* We now have two new best practice guides: [How we build our metrics](/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) and [Set up Continuous Integration](/guides/orchestration/set-up-ci/overview). diff --git a/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md new file mode 100644 index 00000000000..0b588376c34 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/04-Sept-2023/removing-prerelease-versions.md @@ -0,0 +1,15 @@ +--- +title: "Update: Removing old (prerelease) versions of dbt from dbt Cloud when (latest) is available" +description: "Sept 2023: Improving the version selection options by removing prerelease versions whenever the latest version is available." 
+sidebar_label: "Update: Removing old prerelease versions from dbt Cloud" +tags: [Sept-2023, Versions] +date: 2023-09-26 +sidebar_position: 07 +--- + +Previously, when dbt Labs released a new [version](/docs/dbt-versions/core#how-dbt-core-uses-semantic-versioning) in dbt Cloud, the older patch _prerelease_ version and the _latest_ version remained as options in the dropdown menu available in the **Environment settings**. Now, when the _latest_ version is released, the _prerelease_ version will be removed and all customers remaining on it will be migrated seamlessly. There will be no interruptions to service when this migration occurs. + +To see which version you are currently using and to upgrade, select **Deploy** in the top navigation bar and select **Environments**. Choose the preferred environment and click **Settings**. Click **Edit** to make a change to the current dbt version. dbt Labs recommends always using the latest version whenever possible to take advantage of new features and functionality. + + + \ No newline at end of file diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md new file mode 100644 index 00000000000..cd088b92fab --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/deprecation-endpoints-discovery.md @@ -0,0 +1,126 @@ +--- +title: "Deprecation: Query patterns and endpoints in the dbt Cloud Discovery API" +description: "August 2023: Learn about the upcoming deprecation of certain endpoints and query patterns in the Discovery API." 
+sidebar_position: 6 +sidebar_label: "Deprecation: Certain Discovery API endpoints and query patterns" +tags: [Aug-2023, API] +date: 2023-08-31 +--- + +dbt Labs has deprecated and will be deprecating certain query patterns and replacing them with new conventions to enhance the performance of the dbt Cloud [Discovery API](/docs/dbt-cloud-apis/discovery-api). + +All these changes will be in effect on _September 7, 2023_. + +We understand that these changes might require adjustments to your existing integration with the Discovery API. Please [contact us](mailto:support@getdbt.com) with any questions. We're here to help you during this transition period. + +## Job-based queries + +Job-based queries that use the data type `Int` for IDs will be deprecated. They will be marked as deprecated in the [GraphQL explorer](https://metadata.cloud.getdbt.com/graphql). The new convention will be for you to use the data type `BigInt` instead. + +This change will be in effect starting September 7, 2023. + + +Example of query before deprecation: + +```graphql +query ($jobId: Int!) { + models(jobId: $jobId){ + uniqueId + } +} +``` + +Example of query after deprecation: + +```graphql +query ($jobId: BigInt!) { + job(id: $jobId) { + models { + uniqueId + } + } +} +``` + +## modelByEnvironment queries + +The `modelByEnvironment` object has been renamed and moved into the `environment` object. This change is in effect and has been since August 15, 2023. 
+ +Example of query before deprecation: + +```graphql +query ($environmentId: Int!, $uniqueId: String) { + modelByEnvironment(environmentId: $environmentId, uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } +} +``` + +Example of query after deprecation: + +```graphql +query ($environmentId: BigInt!, $uniqueId: String) { + environment(id: $environmentId) { + applied { + modelHistoricalRuns(uniqueId: $uniqueId) { + uniqueId + executionTime + executeCompletedAt + } + } + } +} +``` + + +## Environment and account queries + +Environment and account queries that use `Int` as a data type for ID have been deprecated. IDs must now be in `BigInt`. This change is in effect and has been since August 15, 2023. + + +Example of query before deprecation: + +```graphql +query ($environmentId: Int!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} +``` + + +Example of query after deprecation: + +```graphql +query ($environmentId: BigInt!, $first: Int!) { + environment(id: $environmentId) { + applied { + models(first: $first) { + edges { + node { + uniqueId + executionInfo { + lastRunId + } + } + } + } + } + } +} +``` + + diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md new file mode 100644 index 00000000000..10baa5cd6d7 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/ide-v1.2.md @@ -0,0 +1,38 @@ +--- +title: "Update: Cloud IDE v1.2 includes a new service" +description: "August 2023: Cloud IDE now uses dbt-server to provide more reliable service and dbt Core feature parity, including support for commands like `dbt list`." 
+sidebar_label: "Update: Cloud IDE v1.2" +tags: [Aug-2023, IDE] +date: 2023-08-03 +sidebar_position: 8 +--- + +We're excited to announce that we replaced the backend service that powers the Cloud IDE with a more reliable server -- dbt-server. Because this release contains foundational changes, IDE v1.2 requires dbt v1.6 or higher. This significant update follows the rebuild of the IDE frontend last year. We're committed to improving the IDE to provide you with a better experience. + +Previously, the Cloud IDE used dbt-rpc, an outdated service that was unable to stay up-to-date with changes from dbt-core. The dbt-rpc integration used legacy dbt-core entry points and logging systems, causing it to be sluggish, brittle, and poorly tested. The Core team had been working around this outdated technology to avoid breaking it, which prevented them from developing with velocity and confidence. + +## New features + +- **Better dbt-core parity:** The Cloud IDE has better command parity with dbt-core, including support for commands like `dbt list` and improved treatment of flags like `--vars`, `--fail-fast`, etc. +- **Improved maintainability:** With the new dbt-server, it's easier to fix bugs and improve the overall quality of the product. With dbt-rpc, fixing bugs was a time-consuming and challenging process that required extensive testing. With the new service, we can identify and fix bugs more quickly, resulting in a more stable and reliable IDE. +- **A more reliable service:** Simplified architecture that's less prone to failure. + +### Product refinements + +- Improved `Preview` capabilities with Core v1.6 + IDE v1.2. [This Loom](https://www.loom.com/share/12838feb77bf463c8585fc1fc6aa161b) provides more information. 
+ +### Bug fixes + +- Global page can become "inert" and stop handling clicks +- Switching back and forth between files in the git diff view can cause overwrite +- Browser gets stuck during markdown preview for doc with large table +- Editor right click menu is offset +- Unable to Cancel on the Save New File component when Closing All Files in the IDE +- Mouse flicker in the modal's file tree makes it difficult to select a folder where you want to save a new file +- Snapshots not showing in Lineage when inside a subfolder and is mixed cased named +- Tooltips do not work for Format and Save +- When a dbt invocation is in progress or if parsing is ongoing, attempting to switch branches will cause the `Git Branch` dropdown to close automatically + +### Known issues + +- `{{this}}` function does not display properly in preview/compile with dbt-server diff --git a/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md new file mode 100644 index 00000000000..921ed6dcd79 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/05-Aug-2023/sl-revamp-beta.md @@ -0,0 +1,65 @@ +--- +title: "Enhancement: Revamped dbt Semantic Layer available in public beta" +description: "August 2023: The revamped dbt Semantic Layer, now available in public beta, introduces new semantic components and evolves the semantic layer's capability." +sidebar_label: "Enhancement: Revamped dbt Semantic Layer in public beta" +tags: [Aug-2023, dbt Semantic Layer] +date: 2023-08-03 +sidebar_position: 7 +--- + +:::important +If you're using the legacy Semantic Layer, we **highly** recommend you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the new dbt Semantic Layer. To migrate to the new Semantic Layer, refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. 
+::: + +dbt Labs is thrilled to announce the re-release of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl), now available in [public beta](#public-beta). It aims to bring the best of modeling and semantics to downstream applications by introducing: + +- [MetricFlow](/docs/build/about-metricflow), a framework for constructing performant and legible SQL from an all-new set of semantic constructs, which include semantic models, entities, and metrics. +- New Semantic Layer infrastructure that enables support for more data platforms (Snowflake, Databricks, BigQuery, Redshift, and soon more), along with improved performance. +- New and improved [developer workflows](/guides/migration/sl-migration), governance, and collaboration features. +- New [Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) using JDBC to query metrics and build integrations. + +With semantics at its core, the dbt Semantic Layer marks a crucial milestone towards a new era of centralized logic and data applications. + + + +## Enhanced dbt Semantic Layer + +What sets the dbt Semantic Layer apart is its ability to centralize logic for many downstream data applications, streamlining access and governance and enabling more efficient utilization of data models. It provides a consistent view of data while simplifying complex tasks in downstream applications and reducing the costs of and barriers to data access. + +We are excited to present several important capabilities with the enhanced dbt Semantic Layer: + +- **Consistent organization**: Provides a consistent view of data, ensuring that metrics and definitions match across the organization and the breadth of interfaces where data is consumed. This fosters trust in data and drives better decision-making by eliminating inconsistencies and errors that come up when individual users define metrics independently. 
+ +- **Improved governance**: The dbt Semantic Layer ensures proper governance and auditing of data changes, providing an auditable record of modifications and clear ownership. This saves time by making it clear who can create and manage new metrics, ensuring accountability and data integrity. + +- **Reduced costs**: The dbt Semantic Layer simplifies complex tasks, such as bridging entities across a semantic graph. Often users duplicate slices and dices of data and make them available in a data platform, making the data difficult to manage and causing high computation costs. The dbt Semantic Layer minimizes duplication of work and reduces computational costs - allowing users to focus on analyzing data rather than navigating intricate technical processes or duplicating work. + +- **Enhanced efficiency**: With the dbt Semantic Layer, data teams can create and update metrics using a new set of validations that make defining and iterating on metrics efficient. The streamlined development workflows make it simpler for a data team to serve large organizations with broad data needs. + +- **Accessible data**: Defining common metrics and dimensions and making them joinable makes access simpler for users with less expertise in the specifics of a company's data modeling work. This creates opportunities to leverage data insights, fostering collaboration and driving innovation in a more inclusive data environment. + +By bringing these enhancements to the dbt Semantic Layer, we enable organizations of all sizes and industries to leverage the power of semantics in their data workflows. + +## Public beta + +The dbt Semantic Layer is currently available as a public beta, which means: + +- **Who** — To experience the new dbt Semantic Layer, you must be on a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) multi-tenant plan, [hosted](/docs/cloud/about-cloud/regions-ip-addresses) in North America and on dbt v1.6 or higher. 
Look out for announcements on removing the location requirement soon. + + - Developer plans or dbt Core users can use MetricFlow to define and test metrics using the dbt MetricFlow CLI only. + +- **What** — Public beta provides early access to new features. The dbt Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to general availability later this year. We may also introduce new functionality that isn't backwards compatible. We provide support, and relevant service level objectives (SLOs) apply. If you have any questions on pricing, please reach out to your account representative. + +- **When** — Public beta starts on August 1st, 2023. + +- **Where** — You can experience the dbt Semantic Layer in dbt Cloud. Public beta is enabled at the account level so you don’t need to worry about enabling it per user. + +## Next steps + +To experience the universal dbt Semantic Layer and its enhanced beta capabilities, check out: + +- [Introducing the new dbt Semantic Layer](https://www.getdbt.com/blog/introducing-new-look-dbt-semantic-layer) +- [dbt Semantic Layer docs](/docs/use-dbt-semantic-layer/dbt-sl) +- [dbt Semantic Layer get started guide](/docs/use-dbt-semantic-layer/quickstart-sl) +- [Build your metrics with MetricFlow](/docs/build/build-metrics-intro) + diff --git a/website/docs/docs/dbt-versions/release-notes/05-Sept-2022/liststeps-endpoint-deprecation.md b/website/docs/docs/dbt-versions/release-notes/05-Sept-2022/liststeps-endpoint-deprecation.md deleted file mode 100644 index 07eb72986f2..00000000000 --- a/website/docs/docs/dbt-versions/release-notes/05-Sept-2022/liststeps-endpoint-deprecation.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: "List Steps API endpoint deprecation warning" -id: "liststeps-endpoint-deprecation.md" -description: "List Steps API deprecation" -sidebar_label: "Deprecation: List Steps API endpoint" -tags: [Sept-15-2022] 
---- - -On October 14th, 2022 dbt Labs is deprecating the [List Steps](https://docs.getdbt.com/dbt-cloud/api-v2#tag/Runs/operation/listSteps) API endpoint. From October 14th, any GET requests to this endpoint will fail. Please prepare to stop using the List Steps endpoint as soon as possible. - -dbt Labs will continue to maintain the [Get Run](https://docs.getdbt.com/dbt-cloud/api-v2#tag/Runs/operation/getRunById) endpoint, which is a viable alternative depending on the use case. - -You can fetch run steps for an individual run with a GET request to the following URL: - -`https://cloud.getdbt.com/api/v2/accounts/{accountId}/runs/{runId}/?include_related=["run_steps"]` diff --git a/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md b/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md new file mode 100644 index 00000000000..ba82234c0b5 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/06-July-2023/faster-run.md @@ -0,0 +1,34 @@ +--- +title: "Enhancement: Faster run starts and unlimited job concurrency" +description: "We have enhanced the dbt Cloud Scheduler by reducing prep time for all accounts and provided unlimited job concurrency for Enterprise accounts." +sidebar_label: "Enhancement: Faster run starts and unlimited job concurrency" +tags: [July-2023, scheduler] +date: 2023-07-06 +sidebar_position: 10 +--- + +We’ve introduced significant improvements to the dbt Cloud Scheduler, offering improved performance, durability, and scalability. + +Read more on how you can experience faster run start execution and how enterprise users can now run as many jobs concurrently as they want to. + +## Faster run starts + +The Scheduler takes care of preparing each dbt Cloud job to run in your cloud data platform. 
This [prep](/docs/deploy/job-scheduler#scheduler-queue) involves readying a Kubernetes pod with the right version of dbt installed, setting environment variables, loading data platform credentials, and git provider authorization, amongst other environment-setting tasks. Only after the environment is set up, can dbt execution begin. We display this time to the user in dbt Cloud as “prep time”. + + + +For all its strengths, Kubernetes has challenges, especially with pod management impacting run execution time. We’ve rebuilt our scheduler by ensuring faster job execution with a ready pool of pods to execute customers’ jobs. This means you won't experience long prep times at the top of the hour, and we’re determined to keep runs starting near instantaneously. Don’t just take our word, review the data yourself. + + + +Jobs scheduled at the top of the hour used to take over 106 seconds to prepare because of the volume of runs the scheduler has to process. Now, even with increased runs, we have reduced prep time to 27 secs (at a maximum) — a 75% speed improvement for runs at peak traffic times! + +## Unlimited job concurrency for Enterprise accounts + +Our enhanced scheduler offers more durability and empowers users to run jobs effortlessly. + +This means Enterprise, multi-tenant accounts can now enjoy the advantages of unlimited job concurrency. Previously limited to a fixed number of run slots, Enterprise accounts now have the freedom to operate without constraints. Single-tenant support will be coming soon. Team plan customers will continue to have only 2 run slots. + +Something to note, each running job occupies a run slot for its duration, and if all slots are occupied, jobs will queue accordingly. + +For more feature details, refer to the [dbt Cloud pricing page](https://www.getdbt.com/pricing/). 
diff --git a/website/docs/docs/dbt-versions/release-notes/07-June-2023/admin-api-rn.md b/website/docs/docs/dbt-versions/release-notes/07-June-2023/admin-api-rn.md new file mode 100644 index 00000000000..2008331ebe6 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/07-June-2023/admin-api-rn.md @@ -0,0 +1,15 @@ +--- +title: "Update: dbt Cloud Administrative API docs for v2 and v3" +description: "June 2023 release note: The Administrative API docs are now available for v2 and v3 with a different UI." +sidebar_label: "Update: Admin API docs for v2 and v3 " +tags: [June-2023, API] +sidebar_position: 9 +--- + +dbt Labs updated the docs for the [dbt Cloud Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) and they are now available for both [v2](/dbt-cloud/api-v2#/) and [v3](/dbt-cloud/api-v3#/). + +- Now using Spotlight for improved UI and UX. +- All endpoints are now documented for v2 and v3. Added automation to the docs so they remain up to date. +- Documented many of the request and response bodies. +- You can now test endpoints directly from within the API docs. And, you can choose which [regional server](/docs/cloud/about-cloud/regions-ip-addresses) to use (North America, APAC, or EMEA). +- With the new UI, you can more easily generate code for any endpoint. diff --git a/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md b/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md new file mode 100644 index 00000000000..fa02a6d9bd8 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/07-June-2023/ci-updates-phase1-rn.md @@ -0,0 +1,24 @@ +--- +title: "Update: Improvements to dbt Cloud continuous integration" +description: "dbt Cloud's CI checks now run in parallel, will not block production runs, and stale runs are automatically canceled when a newer commit is pushed." 
+sidebar_label: "Update: Improvements to continuous integration" +tags: [June-2023, CI] +date: 2023-06-20 +sidebar_position: 8 +--- + +dbt Cloud CI is a critical part of the analytics engineering workflow. Large teams rely on process to ensure code quality is high, and they look to dbt Cloud CI to automate testing code changes in an efficient way, enabling speed while keep the bar high. With status checks directly posted to their dbt PRs, developers gain the confidence that their code changes will work as expected in production, and once you’ve grown accustomed to seeing that green status check in your PR, you won’t be able to work any other way. + + + +What separates dbt Cloud CI from other CI providers is its ability to keep track of state of what’s running in your production environment, so that when you run a CI job, only the modified data assets in your pull request and their downstream dependencies get built and tested in a staging schema. dbt Cloud aims to make each CI check as efficient as possible, so as to not waste any data warehouse resources. As soon as the CI run completes, its status posts directly back to the PR in GitHub, GitLab, or Azure DevOps, depending on which Git provider you’re using. Teams can set up guardrails to let only PRs with successful CI checks be approved for merging, and the peer review process is greatly streamlined because dbt Cloud does the first testing pass. + +We're excited to introduce a few critical capabilities to dbt Cloud CI that will improve productivity and collaboration in your team’s testing and integration workflow. As of this week, you can now: + +- **Run multiple CI checks in parallel**. If more than one contributor makes changes to the same dbt project in dbt Cloud in short succession, the later arriving CI check no longer has to wait for the first check to complete. Both checks will execute concurrently. + +- **Automatically cancel stale CI runs**. 
If you push multiple commits to the same PR, dbt Cloud will cancel older, now-out-of-date CI checks automatically. No resources wasted on checking stale code. + +- **Run CI checks without blocking production runs**. CI checks will no longer consume run slots, meaning you can have as many CI checks running as you want, without impeding your production jobs. + +To learn more, refer to [Continuous integration](/docs/deploy/continuous-integration) and [CI jobs](/docs/deploy/ci-jobs). diff --git a/website/docs/docs/dbt-versions/release-notes/07-June-2023/lint-format-rn.md b/website/docs/docs/dbt-versions/release-notes/07-June-2023/lint-format-rn.md new file mode 100644 index 00000000000..e99d1fe3e0b --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/07-June-2023/lint-format-rn.md @@ -0,0 +1,26 @@ +--- +title: "New: You can now lint and format your code in the IDE" +id: "lint-format-rn" +description: "June 2023 release note: In the dbt Cloud IDE, you can perform linting and formatting on SQL, YAML, Markdown, Python, and JSON files using tools like SQLFluff, sqlfmt, Prettier, and Black." +sidebar_label: "New: Lint and format in the IDE" +sidebar_position: 10 +tags: [June-2023, IDE] +--- + +dbt Labs is excited to announce you can now lint and format your dbt code in the dbt Cloud IDE. This is an enhanced development workflow which empowers you to effortlessly prioritize code quality. + +You can perform linting and formatting on five different file types: SQL, YAML, Markdown, Python, and JSON. + +For SQL files, you can easily lint and format your code using [SQLFluff](https://sqlfluff.com/) and apply consistent formatting using [sqlfmt](http://sqlfmt.com/). Additionally, for other file types like YAML, Markdown, JSON, and Python, you can utilize the respective tools powered by [Prettier](https://prettier.io/) and [Black](https://black.readthedocs.io/en/latest/) to ensure clean and standardized code formatting. 
+ +For more info, read [Lint and format your code](/docs/cloud/dbt-cloud-ide/lint-format). + + + + + + + + + + diff --git a/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md b/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md new file mode 100644 index 00000000000..469d2ac362b --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/07-June-2023/product-docs-jun.md @@ -0,0 +1,35 @@ +--- +title: "June 2023 product docs updates" +description: "June 2023: The Product docs team merged 132 PRs, made various updates to dbt Cloud and Core, such as the Deploy sidebar, Supported platforms page, added a landing page on the References section, added an ADO example to the CI/CD guide, and more" +sidebar_label: "Update: Product docs changes" +tags: [June-2023, product-docs] +date: 2023-07-04 +sidebar_position: 10 +--- + +Hello from the dbt Docs team: @mirnawong1, @matthewshaver, @nghi-ly, and @runleonarun! First, we’d like to thank the 17 new community contributors to docs.getdbt.com — ✨ @aaronbini, @sjaureguimodo, @aranke, @eiof, @tlochner95, @mani-dbt, @iamtodor, @monilondo, @vrfn, @raginjason, @AndrewRTsao, @MitchellBarker, @ajaythomas, @smitsrr, @leoguyaux, @GideonShils, @michaelmherrera! + +Here's what's new to [docs.getdbt.com](http://docs.getdbt.com/) in June: + +## ☁ Cloud projects + +- We clarified the nuances of [CI and CI jobs](/docs/deploy/continuous-integration), updated the [Scheduler content](/docs/deploy/job-scheduler), added two new pages for the job settings and run visibility, moved the project state page to the [Syntax page](/reference/node-selection/syntax), and provided a landing page for [Deploying with Cloud](/docs/deploy/jobs) to help readers navigate the content better. 
+- We reformatted the [Supported data platforms page](/docs/supported-data-platforms) by adding dbt Cloud to the page, splitting it into multiple pages, using cards to display verified adapters, and moving the [Warehouse setup pages](/docs/core/connect-data-platform/about-core-connections) to the Docs section. +- We launched a new [Lint and format page](/docs/cloud/dbt-cloud-ide/lint-format), which highlights the awesome new dbt Cloud IDE linting/formatting function. +- We enabled a connection between [dbt Cloud release notes](/docs/dbt-versions/dbt-cloud-release-notes) and the dbt Slack community. This means new dbt Cloud release notes are automatically sent to the slack community [#dbt-cloud channel](https://getdbt.slack.com/archives/CMZ2V0X8V) via RSS feed, keeping users up to date with changes that may affect them. +- We’ve added two new docs links in the dbt Cloud Job settings user interface (UI). This will provide additional guidance and help users succeed when setting up a dbt Cloud job: [job commands](/docs/deploy/job-commands) and job triggers. +- We added information related to the newly created [IT license](/docs/cloud/manage-access/about-user-access#license-based-access-control), available for Team and Enterprise plans. +- We added a new [Supported browser page](/docs/cloud/about-cloud/browsers), which lists the recommended browsers for dbt Cloud. +- We launched a new page informing users of [new Experimental features option](/docs/dbt-versions/experimental-features) in dbt Cloud. +- We worked with dbt Engineering to help publish new beta versions of the dbt [dbt Cloud Administrative API docs](/docs/dbt-cloud-apis/admin-cloud-api). + + +## 🎯 Core projects + +- We launched the new [MetricFlow docs](/docs/build/build-metrics-intro) on dbt Core v1.6 beta. +- Split [Global configs](reference/global-configs/about-global-configs) into individual pages, making it easier to find, especially using search. 
+ + +## New 📚 Guides, ✏️ blog posts, and FAQs + +- Add an Azure DevOps example to the [Customizing CI/CD guide](/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge). diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/discovery-api-public-preview.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/discovery-api-public-preview.md new file mode 100644 index 00000000000..abe0e06f164 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/discovery-api-public-preview.md @@ -0,0 +1,24 @@ +--- +title: "Latest environment state in the dbt Cloud Discovery API" +id: "discovery-api-public-preview" +description: "Public preview of the dbt Cloud Discovery API is now available." +sidebar_label: "New: Latest environment state in the Discovery API" +sidebar_position: 5 +tags: [May-2023, API] +--- + +Users of the Discovery API can now query the latest state of their environment, meaning there's no need to consolidate results across jobs or artifact files. The environment essentially represents the latest production state of a dbt project. The new `environment` endpoint is in public preview and can be used with the existing `modelByEnvironment` endpoint for historical analysis. For details, refer to these docs: + +- [About the Discovery API](/docs/dbt-cloud-apis/discovery-api) +- [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) +- [Query the Discovery API](/docs/dbt-cloud-apis/discovery-querying) +- [Project state in dbt Cloud](/docs/dbt-cloud-apis/project-state) + +## Roadmap + +dbt Labs is continually enhancing the dbt Cloud Discovery API to ensure you have access to timely, rich, and reliable metadata about your dbt runs. In the coming year, we’ll expand the range of questions the API helps answer so you can more easily understand the state, meaning, and structure of your data to inform data development and analysis experiences. 
+ +- **[Now] Query across jobs & lineage** — Get the latest state of a dbt DAG (production environment) to find, understand, and trust the right dataset to analyze. + - **[Ongoing] Improvements** — Enhanced developer ergonomics, state fidelity, and metadata timeliness. +- **[Soon] Query across projects** — View and manage cross-project lineage using public models to define, use, and manage governed datasets for enhanced collaboration across teams. +- **[Later] Query over time** — Understand longer-term dbt Cloud execution result trends to optimize pipeline performance and costs, such as improving costly, error-prone, or slow datasets. diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md new file mode 100644 index 00000000000..d85ffa154dd --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/may-ide-updates.md @@ -0,0 +1,46 @@ +--- +title: "May IDE updates and fixes" +id: "may-ide-updates" +description: "May 2023 release note: We've launched SQLFluff in beta, released an IDE UI page, significantly improved IDE performance, improved error messages, fixed bugs, and more." +sidebar_label: "Update and fixes: IDE" +sidebar_position: 2 +tags: [May-2023, IDE] +--- + +To continue improving your [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) development experience, the dbt Labs team continues to work on adding new features, fixing bugs, and increasing reliability ✨. + +Stay up-to-date with [IDE-related changes](/tags/ide). 
+ +## New features +- Lint via SQL Fluff is now available in beta (GA over the next 2-3 weeks) +- Format markdown files with prettier +- Leverage developer experience shortcuts, including ``Ctrl + ` `` (toggle history drawer), `CMD + Option + /` (toggle block comment), `CMD + Shift + P` (open command palette), `Option + W` (close editor tab) +- Display parent folder name for files with same name in Changes section +- Navigate the new IDE features quickly using [the IDE User Interface](/docs/cloud/dbt-cloud-ide/ide-user-interface) help page +- Use `top X` in SQL when previewing in the IDE +- Opt into the new IDE backend layer over the past month (still with dbt-rpc). Ready for beta later in June! + + +## Product refinements + +- Performance-related upgrades: + - Reduced cold start time by 60+% + - Improved render time of modals in the IDE by 98% + - Improved IDE performance with dbt Core v1.5+ (faster and snappier – highly encourage you to [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud)!) +- Upgraded sqlfmt (which powers the Format button) to 0.18.0 +- Updated Build button to change menu options based on file/model type (snapshot, macro, etc.) 
+- Display message to disable adblocker for file contents error +- Moved Format button to console bar +- Made many security enhancements in the IDE +## Bug fixes + +- File icon sizes no longer get wonky in small screen +- Toast notifications no longer take over command bar menu +- Hover info inside the text editor no longer gets cut off +- Transition between a file and a recently modified scratchpad no longer triggers a console error +- dbt v1.5+ now can access the IDE +- Confirm button on the Unsaved Changes modal now closes after clicking it +- Long node names no longer overflow in the parsed logs section in history drawer +- Status pill in history drawer no longer scales with longer command +- Tooltip for tab name with a long file name is no longer cut off +- Lint button should no longer available in main branch diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/product-docs-may.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/product-docs-may.md new file mode 100644 index 00000000000..762a6a723f8 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/product-docs-may.md @@ -0,0 +1,43 @@ +--- +title: "May 2023 product docs updates" +id: "May-product-docs" +description: "May 2023: Find out what the product docs team has been busy doing in the month of May." +sidebar_label: "Update: Product docs changes" +sidebar_position: 1 +tags: [May-2023, product-docs] +date: 2023-06-01 +--- + +Hello from the dbt Docs team: @mirnawong1, @matthewshaver, @nghi-ly, and @runleonarun! First, we’d like to thank the 13 new community contributors to docs.getdbt.com! + +Here's what's new to [docs.getdbt.com](http://docs.getdbt.com/) in May: + +## 🔎 Discoverability + +- We made sure everyone knows that Cloud-users don’t need a [profiles.yml file](/docs/core/connect-data-platform/profiles.yml) by adding a callout on several key pages. 
+- Fleshed out the [model jinja variable page](/reference/dbt-jinja-functions/model), which originally lacked conceptual info and didn’t link to the schema page. +- Added a new [Quickstarts landing page](/quickstarts). This new format sets up for future iterations that will include filtering! But for now, we are excited you can step through quickstarts in a focused way. + +## ☁ Cloud projects + +- We launched [dbt Cloud IDE user interface doc](/docs/cloud/dbt-cloud-ide/ide-user-interface), which provides a thorough walkthrough of the IDE UI elements and their definitions. +- Launched a sparkling new [dbt Cloud Scheduler page](/docs/deploy/job-scheduler) ✨! We went from previously having little content around the scheduler to a subsection that breaks down the awesome scheduler features and how it works. +- Updated the [dbt Cloud user license page](/docs/cloud/manage-access/seats-and-users#licenses) to clarify how to add or remove cloud users. +- Shipped these Discovery API docs to coincide with the launch of the Discovery API: + - [About the Discovery API](/docs/dbt-cloud-apis/discovery-api) + - [Use cases and examples for the Discovery API](/docs/dbt-cloud-apis/discovery-use-cases-and-examples) + - [Query the Discovery API](/docs/dbt-cloud-apis/discovery-querying) + +## 🎯 Core projects + +- See what’s coming up [in Core v 1.6](https://github.com/dbt-labs/docs.getdbt.com/issues?q=is%3Aissue+label%3A%22dbt-core+v1.6%22)! +- We turned the `profiles.yml` [page](/docs/core/connect-data-platform/profiles.yml) into a landing page, added more context to profile.yml page, and moved the ‘About CLI’ higher up in the `Set up dbt` section. 
+ +## New 📚 Guides, ✏️ blog posts, and FAQs + +If you want to contribute to a blog post, we’re focusing on content + +- Published a blog post: [Accelerate your documentation workflow: Generate docs for whole folders at once](/blog/generating-dynamic-docs-dbt) +- Published a blog post: [Data engineers + dbt v1.5: Evolving the craft for scale](/blog/evolving-data-engineer-craft) +- Added an [FAQ](/faqs/Warehouse/db-connection-dbt-compile) to clarify the common question users have on *Why does dbt compile needs to connect to the database?* +- Published a [discourse article](https://discourse.getdbt.com/t/how-to-configure-external-user-email-notifications-in-dbt-cloud/8393) about configuring job notifications for non-dbt Cloud users diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-details-and-logs-improvements.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-details-and-logs-improvements.md new file mode 100644 index 00000000000..1aabe517076 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-details-and-logs-improvements.md @@ -0,0 +1,19 @@ +--- +title: "Run details and logs usability and design improvements" +sidebar_label: "Update: Improvements to run details and logs" +sidebar_position: 3 +tags: [May-2023, Scheduler] +--- + +New usability and design improvements to the run details and logs in dbt Cloud are now available. The ability to triage errors in logs is a big benefit of using dbt Cloud's job and scheduler functionality. These updates help make the process of finding the root cause much easier. 
+ +Highlights include: + +- Surfacing a warn state on a run step +- Search in logs +- Easier discoverability of errors and warnings in logs +- Lazy loading of logs, making the whole run details page load faster and feel more performant +- Cleaner look and feel with iconography +- Helpful tool tips + + diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-history-endpoint.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-history-endpoint.md new file mode 100644 index 00000000000..050fd8339a2 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-history-endpoint.md @@ -0,0 +1,20 @@ +--- +title: "Older Run History retrieval change" +id: "run-history-endpoint" +sidebar_label: "Update: Older Run History retrieval change" +sidebar_position: 6 +tags: [May-2023, API, Scheduler] +--- + +dbt Labs is making a change to the metadata retrieval policy for Run History in dbt Cloud. + + +**Beginning June 1, 2023,** developers on the dbt Cloud multi-tenant application will be able to self-serve access to their account’s run history through the dbt Cloud user interface (UI) and API for only 365 days, on a rolling basis. Older run history will be available for download by reaching out to Customer Support. We're seeking to minimize the amount of metadata we store while maximizing application performance. + + +Specifically, all `GET` requests to the dbt Cloud [Runs endpoint](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#tag/Runs) will return information on runs, artifacts, logs, and run steps only for the past 365 days. Additionally, the run history displayed in the dbt Cloud UI will only show runs for the past 365 days. + + + +We will retain older run history in cold storage and can make it available to customers who reach out to our Support team. 
To request older run history info, contact the Support team at [support@getdbt.com](mailto:support@getdbt.com) or use the dbt Cloud application chat by clicking the `?` icon in the dbt Cloud UI. + diff --git a/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-history-improvements.md b/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-history-improvements.md new file mode 100644 index 00000000000..d4d299b1d36 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/08-May-2023/run-history-improvements.md @@ -0,0 +1,21 @@ +--- +title: "Run History usability and design improvements" +sidebar_label: "Update: Run History improvements" +sidebar_position: 4 +tags: [May-2023, Scheduler] +--- + +New usability and design improvements to the **Run History** dashboard in dbt Cloud are now available. These updates allow people to discover the information they need more easily by reducing the number of clicks, surfacing more relevant information, keeping people in flow state, and designing the look and feel to be more intuitive to use. 
+ + + + +Highlights include: + +- Usability improvements for CI runs with hyperlinks to the branch, PR, and commit SHA, along with more discoverable temporary schema names +- Preview of runs' error messages on hover +- Hyperlinks to the environment +- Better iconography on run status +- Clearer run trigger cause (API, scheduled, pull request, triggered by user) +- More details on the schedule time on hover +- Run timeout visibility diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2023/api-endpoint-restriction.md b/website/docs/docs/dbt-versions/release-notes/09-April-2023/api-endpoint-restriction.md new file mode 100644 index 00000000000..8507fe3dbbb --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/09-April-2023/api-endpoint-restriction.md @@ -0,0 +1,23 @@ +--- +title: "List Runs API Endpoint `order_by` restrictions" +id: "api-endpoint-restriction" +sidebar_label: "Deprecation: List Runs API Endpoint order_by restrictions" +sidebar_position: 10 +tags: [Apr-2023, API] +--- + +Starting May 15, 2023, we will support only the following `order_by` functionality for the List Runs endpoint: + +- `id` and `-id` +- `created_at` and `-created_at` +- `finished_at` and `-finished_at` + +We recommend that you change your API requests to https:///api/v2/accounts/{accountId}/runs/ to use a supported `order_by` before this date. + +:::info Access URLs + +dbt Cloud is hosted in multiple regions around the world, and each region has a different access URL. Users on Enterprise plans can choose to have their account hosted in any one of these regions. For a complete list of available dbt Cloud access URLs, refer to [Regions & IP addresses](/docs/cloud/about-cloud/regions-ip-addresses). + +::: + +For more info, refer to our [documentation](/dbt-cloud/api-v2-legacy#tag/Runs/operation/listRunsForAccount). 
diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2023/apr-ide-updates.md b/website/docs/docs/dbt-versions/release-notes/09-April-2023/apr-ide-updates.md new file mode 100644 index 00000000000..0960feb0e2f --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/09-April-2023/apr-ide-updates.md @@ -0,0 +1,45 @@ +--- +title: "April IDE updates and fixes" +id: "apr-ide-updates" +description: "Apr 2023 release note: We've enhanced the IDE by displaying load times when previewing models, showing live previews of Markdown and CSV files, adding the ability to duplicate files in the File Tree, and more." +sidebar_label: "Update and fixes: IDE" +sidebar_position: 7 +tags: [Apr-2023, IDE] +--- + +To continue improving your [Cloud IDE](https://docs.getdbt.com/docs/cloud/develop-in-the-cloud) development experience, the dbt Labs team continue to work on adding new features, fixing bugs, and increasing reliability ✨. + +Read more about the [upcoming improvements to the Cloud IDE](https://www.getdbt.com/blog/improvements-to-the-dbt-cloud-ide/) and stay up-to-date with [IDE-related changes](https://docs.getdbt.com/tags/ide). + +## New features + +* New warning message suggests you invoke `dbt deps` when it's needed (as informed by `dbt-score`). +* New warning message appears when you select models but don't save them before clicking **Build** or invoking dbt (like, dbt build/run/test). +* Previews of Markdown and CSV files are now available in the IDE console. +* The file tree menu now includes a Duplicate File option. 
+* Display loading time when previewing a model + +## Product refinements + +* Enhance autocomplete experience which has performed slowly for people with large projects and who implement a limit to max `manifest.json` for this feature +* Introduce pagination for invocation node summary view (displaying 100 nodes at a time) +* Improve rendering for the Changes / Version Control section of the IDE +* Update icons to be consistent in dbt Cloud +* Add table support to the Markdown preview +* Add the lineage tab back to seed resources in the IDE +* Implement modal priority when there are multiple warning modals +* Improve a complex command's description in the command palette + +## Bug fixes + +* File tree no longer collapses on first click when there's a project subdirectory defined +* **Revert all** button now works as expected +* CSV preview no longer fails with only one column +* Cursor and scroll bar location are now persistent with their positions +* `git diff` view now shows just change diffs and no longer shows full diff (as if file is new) until page refreshes +* ToggleMinimap Command no longer runs another Command at the same time +* `git diff` view no longer shows infinite spins in specific scenarios (new file, etc.) 
+* File contents no longer get mixed up when using diff view and one file has unsaved changes +* YML lineage now renders model without tests (in dbt Core v1.5 and above) +* Radio buttons for **Summary** and **Details** in the logs section now consistently update to show the accurate tab selection +* IDE no longer throws the console error `Error: Illegal argument` and redirects to the `Something went wrong` page diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md b/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md new file mode 100644 index 00000000000..d30bcf85b99 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/09-April-2023/product-docs.md @@ -0,0 +1,40 @@ +--- +title: "April 2023 product docs updates" +id: "april-product-docs" +description: "April 2023: " +sidebar_label: "Update: product docs changes" +sidebar_position: 8 +tags: [Apr-2023, product-docs] +--- + +Hello from the dbt Docs team: @mirnawong1, @matthewshaver, @nghi-ly, and @runleonarun! We want to share some highlights introduced to docs.getdbt.com in the last month: + +## 🔎 Discoverability + +- [API docs](/docs/dbt-cloud-apis/overview) now live in the left sidebar to improve discoverability. +- [The deploy dbt jobs sidebar](/docs/deploy/deployments) has had a glow up 💅 that splits the ‘about deployment’ into two paths (deploy w dbt cloud and deploy w other tools), adds more info about the dbt cloud scheduler, its features, and how to create a job, adds ADF deployment guidance. We hope the changes improve the user experience and provide users with guidance when deploying with other tools. 
+ +## ☁ Cloud projects + +- Added Starburst/Trino adapter docs, including: + * [dbt Cloud quickstart guide](/quickstarts/starburst-galaxy),  + * [connection page](/docs/cloud/connect-data-platform/connect-starburst-trino),  + * [set up page](/docs/core/connect-data-platform/trino-setup), and [config page](/reference/resource-configs/trino-configs). +- Enhanced [dbt Cloud jobs page](/docs/deploy/jobs) and section to include conceptual info on the queue time, improvements made around it, and about failed jobs. +- Check out the April dbt [Cloud release notes](/docs/dbt-versions/dbt-cloud-release-notes) + +## 🎯 Core projects + +- Clearer descriptions in the [Jinja functions page](/reference/dbt-jinja-functions), that improve content for each card.  +- [1.5 Docs](/guides/migration/versions/upgrading-to-v1.5) have been released as an RC! +- See the beautiful [work captured in Core v 1.5](https://github.com/dbt-labs/docs.getdbt.com/issues?q=is%3Aissue+label%3A%22dbt-core+v1.5%22+is%3Aclosed). + +## New 📚 Guides and ✏️ blog posts + +- [Use Databricks workflows to run dbt Cloud jobs](/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs) +- [Refresh Tableau workbook with extracts after a job finishes](/guides/orchestration/webhooks/zapier-refresh-tableau-workbook) +- [dbt Python Snowpark workshop/tutorial](/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark) +- [How to optimize and troubleshoot dbt Models on Databricks](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks) +- [The missing guide to debug() in dbt](https://docs.getdbt.com/blog/guide-to-jinja-debug) +- [dbt Squared: Leveraging dbt Core and dbt Cloud together at scale](https://docs.getdbt.com/blog/dbt-squared) +- [Audit_helper in dbt: Bringing data auditing to a higher level](https://docs.getdbt.com/blog/audit-helper-for-migration) diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2023/scheduler-optimized.md 
b/website/docs/docs/dbt-versions/release-notes/09-April-2023/scheduler-optimized.md new file mode 100644 index 00000000000..e552c449808 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/09-April-2023/scheduler-optimized.md @@ -0,0 +1,16 @@ +--- +title: "The dbt Cloud scheduler now prevents queue clog by canceling unnecessary runs of over-scheduled jobs" +id: "scheduler-optimized" +description: "April 2023: " +sidebar_label: "Update: Scheduler optimizes job queue" +sidebar_position: 9 +tags: [Apr-2023, scheduler] +--- + +The dbt Cloud Scheduler now prevents queue clog by canceling unnecessary runs of over-scheduled jobs. + +The duration of a job run tends to grow over time, usually caused by growing amounts of data in the warehouse. If the run duration becomes longer than the frequency of the job’s schedule, the queue will grow faster than the scheduler can process the job’s runs, leading to a runaway queue with runs that don’t need to be processed. + +Previously, when a job was in this over-scheduled state, the scheduler would stop queuing runs after 50 were already in the queue. This led to a poor user experience where the scheduler canceled runs indiscriminately. You’d have to log into dbt Cloud to manually cancel all the queued runs and change the job schedule to "unclog" the scheduler queue. + +Now, the dbt Cloud scheduler detects when a scheduled job is set to run too frequently and appropriately cancels runs that don’t need to be processed. Specifically, scheduled jobs can only ever have one run of the job in the queue, and if a more recent run gets queued, the early queued run will get canceled with a helpful error message. Users will still need to either refactor the job so it runs faster or change the job schedule to run less often if the job often gets into an over-scheduled state. 
diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2023/starburst-trino-ga.md b/website/docs/docs/dbt-versions/release-notes/09-April-2023/starburst-trino-ga.md new file mode 100644 index 00000000000..613a0c02432 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/09-April-2023/starburst-trino-ga.md @@ -0,0 +1,12 @@ +--- +title: "Integrate Starburst and Trino with dbt-trino in dbt Cloud" +id: "starburst-trino-ga" +sidebar_label: "New: Integrate Starburst and Trino in dbt Cloud" +sidebar_position: 8 +tags: [Apr-2023] +--- + +The Starburst (Trino compatible) connection is now generally available in dbt Cloud. This means you can now use dbt Cloud to connect with Starburst Galaxy, Starburst Enterprise, and self-hosted Trino. This feature is powered by the [`dbt-trino`](https://github.com/starburstdata/dbt-trino) adapter. + +To learn more, check out our Quickstart guide for [dbt Cloud and Starburst Galaxy](https://docs.getdbt.com/quickstarts/starburst-galaxy). + diff --git a/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/1.0-deprecation.md b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/1.0-deprecation.md new file mode 100644 index 00000000000..b11bf702330 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/1.0-deprecation.md @@ -0,0 +1,21 @@ +--- +title: "dbt Cloud requires dbt version 1.0 or later" +id: "1.0-deprecation" +description: "Mar 2023: dbt Cloud now requires dbt version 1.0 or later and non-compliant jobs or environments were automatically upgraded to v1.4" +sidebar_label: "Deprecation: dbt Cloud requires dbt v1.0 or later." +tags: [Mar-2023] +--- + + +dbt Cloud now requires dbt version 1.0 or later. As of March 1, 2023, we removed all instances of older dbt versions from dbt Cloud. + +Any environments or jobs configured with a dbt version lower than 1.0 were automatically updated to dbt v1.4, which is the latest minor version available on dbt Cloud. 
+ +For more info on dbt versions, releases, and dbt Cloud support timeline, refer to [About dbt Core versions](/docs/dbt-versions/core#latest-releases). + +Refer to some additional info and resources to help you upgrade your dbt version: + +- [How to upgrade dbt without fear](https://docs.getdbt.com/blog/upgrade-dbt-without-fear) +- [Upgrade Q&A on breaking changes](/docs/dbt-versions/upgrade-core-in-cloud#upgrading-legacy-versions-under-10) +- [Version migration guides](/guides/migration/versions) + diff --git a/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/apiv2-limit.md b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/apiv2-limit.md new file mode 100644 index 00000000000..85c4af48b54 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/apiv2-limit.md @@ -0,0 +1,14 @@ +--- +title: "API requests have a maximum limit of `100`" +id: "apiv2-limit" +description: "Mar 2023: In order to ease pressure on our API, we have implemented a maximum limit of `100` for all API requests to our `list` endpoints. This limit is applicable to multi-tenant instances only." +sidebar_label: "Update: API requests have a maximum limit of `100`" +tags: [Mar-2023, API] +--- + + +To make the API more scalable and reliable, we've implemented a maximum limit of `100` for all API requests to our `list` endpoints. If API requests exceed the maximum limit parameter of `100`, a user will receive an API error message. + +This maximum limit applies to [multi-tenant instances](/docs/cloud/about-cloud/regions-ip-addresses) only, and _does not_ apply to single-tenant instances. + +Refer to the [Pagination](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#section/Pagination) section for more information on this change. 
diff --git a/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/mar-ide-updates.md b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/mar-ide-updates.md new file mode 100644 index 00000000000..d72d95e9b23 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/mar-ide-updates.md @@ -0,0 +1,50 @@ +--- +title: "March IDE updates and fixes" +id: "mar-ide-updates" +description: "Mar 2023 release note: We've enhanced the IDE by adding common dbt commands to the command palette, creating PRs even if you have uncommitted changes, autocompleting suggestions when editing a yml file, editing directly in the git diff view, improving the DAG selector, upgrading sqlfmt, improving syntax error messaging, and more." +sidebar_label: "Update and fixes: IDE" +tags: [Mar-2023, IDE] +--- + +To continue improving your [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) development experience, the dbt Labs team continues to work on adding new features, fixing bugs, and increasing reliability ✨. + +Read more about the [upcoming improvements to the Cloud IDE](https://www.getdbt.com/blog/improvements-to-the-dbt-cloud-ide/) and stay up-to-date with [IDE-related changes](https://docs.getdbt.com/tags/ide). + + +## New features + +- Commit and revert individual files under **Version Control**. +- Use the [command palette](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#cloud-ide-features) to invoke common complex dbt commands, such as resuming from the last failure. +- Create PRs even when there are uncommitted changes (under the **git** dropdown). +- The IDE will display more autocomplete suggestions when editing a YML file, powered by [dbt-jsonschema](https://github.com/dbt-labs/dbt-jsonschema). +- The file tree now has additional options in the right-click menu, such as Copy model as ref or Copy file path. +- The DAG view has been adjusted to a default of `2+model+2`. +- A lineage selector has been implemented in the DAG/lineage sub-tab. 
+- Edit directly in the git diff view located in the right pane. +- A warning message will now appear when users press Command-W/Control-W when there are unsaved changes. +- A new onboarding flow guide is now available. + +## Product refinements + +- The DAG selector now uses `name` instead of `file_uri` to build selectors. +- The DAG is now vertically centered under the new Selector Input element. +- sqlfmt has been upgraded to v0.17.0. +- When the Format button fails, a toast notification will display a syntax error. +- The editor now has the option to toggle minimap/word-wrap via right-click. +- The history drawer displays elapsed time in real time and s/m/h increments. +- When deleting development environments, the delete modal will now warn users that any uncommitted changes will be lost. +- The context for the Git button has been adjusted to show that it will link to an external site (such as GitHub or GitLab) when users create a pull request. + +## Bug fixes + +- The IDE now displays an error message when the git repository is not reachable. Previously, it failed silently. +- The kebab menu is now visible when the invocation history drawer is open. Previously, it wasn't showing. +- DAGs are now updated/populated consistently. Previously, this occasionally failed. +- The purple highlight for DAG selection is now consistent across files. Previously, it was inconsistent. +- Users can now rename files back to their original name. Previously, this wasn't possible. +- The link to the IDE from the project setup page has been corrected. +- The IDE no longer has issues with single-space file names. +- Adding invalid characters in the sub-directory config no longer causes the IDE to fail. +- YML autocomplete triggers consistently now. Previously, it occasionally didn't trigger. +- Reverting single files now reloads the file contents in the tab. Previously, it didn't reload. 
+- The file tree no longer collapses on the first click when there is a project subdirectory defined. diff --git a/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/public-preview-trino-in-dbt-cloud.md b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/public-preview-trino-in-dbt-cloud.md new file mode 100644 index 00000000000..bf3840a8b02 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/10-Mar-2023/public-preview-trino-in-dbt-cloud.md @@ -0,0 +1,17 @@ +--- +title: "Starburst integration with public preview of dbt-trino in dbt Cloud" +id: "public-preview-trino-in-dbt-cloud" +description: "Public preview of dbt Cloud integrations with Starburst and Trino clusters is now available." +sidebar_label: "New: Starburst integration with public preview of dbt-trino in dbt Cloud" +tags: [Mar-2023] +--- + +dbt Labs is introducing the newest connection option in dbt Cloud: the `dbt-trino` adapter is now available in Public Preview. This allows you to connect to Starburst Galaxy, Starburst Enterprise, and self-hosted Trino from dbt Cloud. + +Check out our [Quickstart for dbt Cloud and Starburst Galaxy](/quickstarts/starburst-galaxy) to explore more. + +## What’s the reason users should be excited about this? + +By using Starburst or Trino as your data platform in dbt Cloud, you can now seamlessly transform data that's distributed across multiple sources, including object storage, data lakes, and data warehouses, with no data movement required. + +The seamless integration means you can now get up and running within minutes. 
diff --git a/website/docs/docs/dbt-versions/release-notes/11-Feb-2023/feb-ide-updates.md b/website/docs/docs/dbt-versions/release-notes/11-Feb-2023/feb-ide-updates.md new file mode 100644 index 00000000000..d52ad2d4081 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/11-Feb-2023/feb-ide-updates.md @@ -0,0 +1,41 @@ +--- +title: "Feb IDE updates and fixes" +id: "feb-ide-updates" +description: "Feb 2023 release note: We've enhanced the IDE by adding custom node colors in the DAG, ref autocomplete, double-click files to rename them, add link to repo from the branch name, enabled syntax highlighting for jinja, improve file tree render time, and more." +sidebar_label: "Update and fixes: IDE" +tags: [Feb-2023, IDE] +--- + +To continue improving our [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) experience, the dbt Labs team worked on fixing bugs, increasing reliability, and adding new features ✨. + +Learn more about the [February changes](https://getdbt.slack.com/archives/C03SAHKKG2Z/p1677605383451109). + +## New features + +- Support for custom node colors in the IDE DAG visualization +- Autosave prototype is now available under feature flag. 
[Contact](mailto:cloud-ide-feedback@dbtlabs.com) the dbt Labs IDE team to try this out +- Ref autocomplete includes models from seeds and snapshots +- Prevent menus from getting cropped (git controls dropdown, file tree dropdown, build button, editor tab options) +- Additional option to access the file menu by right-clicking on the files and folders in the file tree +- Rename files by double-clicking on files in the file tree and the editor tabs +- Right-clicking on file tabs has new options and will now open at your cursor instead of in the middle of the tab +- The git branch name above **Version Control** links to the repo for specific git providers + * Currently available for all [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) instances using GitHub or GitLab providers + +## Product refinements + +- Added an error modal for RPC parsing errors when users attempt to invoke dbt commands (preview, compile, or general dbt invocations) +- Enabled syntax highlighting for Jinja expression and statement delimiters +- Clarified and renamed the options under the **Build** button +- Changed the term for RPC status from `Compiling` to `Parsing` to match dbt-core construct +- Implemented a new File Tree component to improve render time by 60% +- Disabled the Local Storage of File Tree to prevent users from running into max LocalStorage issue for large projects +- Changed snapshot snippet template (`__snapshot`) to a select from source + +## Bug fixes + +- You no longer have file contents carrying over when you switch to a different project that has the same file name +- The preview max limit no longer allows you to override the maximum +- You no longer encounter node statuses failing to update in the history drawer for those on version 1.4 core. 
(This is a partial fix that may be fully addressed by core version 1.5) +- You can now use the **Copy File Name** option to copy up to the last dot, rather than the first dot + diff --git a/website/docs/docs/dbt-versions/release-notes/11-Feb-2023/no-partial-parse-config.md b/website/docs/docs/dbt-versions/release-notes/11-Feb-2023/no-partial-parse-config.md new file mode 100644 index 00000000000..0727c5eb339 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/11-Feb-2023/no-partial-parse-config.md @@ -0,0 +1,11 @@ +--- +title: "Disable partial parsing in dbt Cloud job commands" +id: "no-partial-parse-config" +description: "You can now disable partial parsing in dbt Cloud job commands." +sidebar_label: "New: Disable partial parsing in job commands" +tags: [Feb-2023] +--- + +You can now use the `--no-partial-parse` flag to disable partial parsing in your dbt Cloud job commands.  + +Previously, the [`--no-partial-parse` global config](/reference/global-configs/parsing) was only available in dbt Core. For more information, refer to [partial parsing](/reference/parsing#partial-parsing). diff --git a/website/docs/docs/dbt-versions/release-notes/12-Jan-2023/ide-updates.md b/website/docs/docs/dbt-versions/release-notes/12-Jan-2023/ide-updates.md new file mode 100644 index 00000000000..e8b3dcc4caa --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/12-Jan-2023/ide-updates.md @@ -0,0 +1,43 @@ +--- +title: "Jan IDE updates and fixes" +id: "ide-updates" +description: "Jan 2023 release note: We've enhanced the IDE with improved syntax highlighting, faster and snappier IDE, improved error messaging, view repo status, added an easter egg, and more." +sidebar_label: "Update and fixes: IDE" +tags: [Jan-2023, IDE] +--- + +In the spirit of continuing to improve our [Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) experience, the dbt Labs team worked on fixing bugs, increasing reliability, and adding new features ✨. 
+ +Learn more about the [January changes](https://getdbt.slack.com/archives/C03SAHKKG2Z/p1675272600286119) and what's coming soon. + +## New features + +- Improved syntax highlighting within the IDE for better Jinja-SQL combination (double quotes now show proper syntax highlight!) +- Adjusted the routing URL for the IDE page and removed the `next` from the URL +- Added a *new* easter egg within the IDE 🐶🦆 + +## Product refinements + +- Performance improvements and reduced IDE slowness. The IDE should feel faster and snappier. +- Reliability improvements – Improved error handling that previously put IDE in a bad state +- Corrected the list of dropdown options for the Build button +- Adjusted startup page duration +- Added code snippets for `unique` and `not_null` tests for yml files +- Added code snippets for metrics based on environment dbt versions +- Changed “commit and push” to “commit and sync” to better reflect the action +- Improved error message when saving or renaming files to duplicate names + +## Bug fixes + +- You no longer arbitrarily encounter an `RPC server got an unknown async ID` message +- You can now see the build button dropdown, which had been hidden behind the placeholder DAG screen +- You can now close toast notifications for command failure when the history drawer is open +- You no longer encounter a `Something went wrong` message when previewing a model +- You can now see repository status in the IDE, and the IDE finds the SSH folder +- Scroll bars and download CSV no longer flicker within the preview pane + +## Coming soon + +- dbt Labs will roll out a series of features that should improve the quality of life within the IDE over the next several months, such as autosave, the ability to revert individual files, and user experience improvements, like right-clicking. + +- dbt Labs is researching ways to include Linting/SQL Fluff in the IDE. 
 If anyone is interested in sharing how you're using SQL Fluff today, please reach out to the [dbt Labs IDE team](mailto:cloud-ide-feedback@dbtlabs.com). diff --git a/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/default-thread-value.md b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/default-thread-value.md new file mode 100644 index 00000000000..90f3156599c --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/default-thread-value.md @@ -0,0 +1,15 @@ +--- +title: "Threads default value changed to 4" +id: "default-thread-value" +description: "Threads now default to 4 in users' profiles." +sidebar_label: "Enhancement: Threads default value changed to 4 " +tags: [Dec-2022] +--- + +Threads help parallelize node execution in the dbt directed acyclic graph [(DAG)](https://docs.getdbt.com/terms/dag). + +Previously, the thread value defaulted to 1, which can increase the runtime of your project. To help reduce the runtime of your project, the default value for threads in user profiles is now set to 4 threads. + +You can supply a custom thread count if you'd prefer more or less parallelization. + +For more information, read [Understanding threads](/docs/running-a-dbt-project/using-threads). diff --git a/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md new file mode 100644 index 00000000000..bdc89b4abde --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/new-jobs-default-as-off.md @@ -0,0 +1,14 @@ +--- +title: "Creating a new job no longer triggers a run by default" +id: "new-jobs-default-as-off.md" +description: "You need to click **Run on schedule** before a job will be scheduled to run" +sidebar_label: "Improvement: New jobs no longer run by default" +tags: [Dec-2022] +--- + +To help save compute time, new jobs will no longer be triggered to run by default. 
When you create a new job in dbt Cloud, you can trigger the job to run by selecting **Run on schedule** and completing the desired schedule and timing information. + +For more information, refer to [Deploy jobs](/docs/deploy/deploy-jobs). + + + diff --git a/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/private-packages-clone-git-token.md b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/private-packages-clone-git-token.md new file mode 100644 index 00000000000..75db9d1888b --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/23-Dec-2022/private-packages-clone-git-token.md @@ -0,0 +1,25 @@ +--- +title: "Private packages must be cloned using access tokens provided by environment variables" +description: "Private GitHub packages must be cloned using access tokens provided by environment variables." +sidebar_label: "Deprecation: Private packages must be cloned using access tokens" +tags: [Dec-2022] +--- + +The supported method for cloning private GitHub packages is the [git token method](/docs/build/packages#git-token-method), where an appropriate access token is passed into the package repository URL with an environment variable. + +A small number of people have been able to clone private packages using dbt's native GitHub application without explicitly providing an access token. This functionality is being deprecated as it’s limited in flexibility. + +If you have been using a package hosted in a private repository on GitHub, you must start passing an access token into the URL. 
+ +An example of passing an access token: + + + +```yaml + +packages: +- git: "https://{{env_var('DBT_ENV_SECRET_GIT_CREDENTIAL')}}@github.com/dbt-labs/awesome_repo.git" + +``` + + diff --git a/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/dbt-databricks-unity-catalog-support.md b/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/dbt-databricks-unity-catalog-support.md new file mode 100644 index 00000000000..25d5ca5205f --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/dbt-databricks-unity-catalog-support.md @@ -0,0 +1,13 @@ +--- +title: "The dbt Cloud + Databricks experience is getting even better" +id: "cloud-databricks-unitycatalog" +description: "The dbt-databricks adapter is now available for use with dbt Cloud" +sidebar_label: "Enhancement: Support for Databricks Unity Catalog with dbt-databricks" +tags: [Nov-2022, v1.1.66.15] +--- + +dbt Cloud is the easiest and most reliable way to develop and deploy a dbt project. It helps remove complexity while also giving you more features and better performance. A simpler Databricks connection experience with support for Databricks’ Unity Catalog and better modeling defaults is now available for your use. + +For all the Databricks customers already using dbt Cloud with the dbt-spark adapter, you can now [migrate](https://docs.getdbt.com/guides/migration/tools/migrating-from-spark-to-databricks#migration) your connection to the [dbt-databricks adapter](https://docs.getdbt.com/reference/warehouse-setups/databricks-setup) to get the benefits. [Databricks](https://www.databricks.com/blog/2022/11/17/introducing-native-high-performance-integration-dbt-cloud.html) is committed to maintaining and improving the adapter, so this integrated experience will continue to provide the best of dbt and Databricks. + +Check out our [live blog post](https://www.getdbt.com/blog/dbt-cloud-databricks-experience/) to learn more. 
diff --git a/website/docs/docs/dbt-versions/release-notes/04-Nov-2022/ide-features-ide-deprecation.md b/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/ide-features-ide-deprecation.md similarity index 54% rename from website/docs/docs/dbt-versions/release-notes/04-Nov-2022/ide-features-ide-deprecation.md rename to website/docs/docs/dbt-versions/release-notes/24-Nov-2022/ide-features-ide-deprecation.md index becad55356c..87a7b179b6d 100644 --- a/website/docs/docs/dbt-versions/release-notes/04-Nov-2022/ide-features-ide-deprecation.md +++ b/website/docs/docs/dbt-versions/release-notes/24-Nov-2022/ide-features-ide-deprecation.md @@ -1,10 +1,13 @@ --- -title: "Extra features in new IDE, and classic IDE deprecation" +title: "New IDE features and classic IDE deprecation" id: "ide-features-ide-deprecation" -description: "Enhancement and Deprecation: Extra features in new IDE, and classic IDE deprecation" -sidebar_label: "Enhancement and deprecation: Extra features in the new IDE and classic IDE deprecation" -tags: [Nov-29-2022, v1.1.67.0] +description: "Nov 2022 release note: We've enhanced the IDE by adding a button to automatically format your SQL. Added dark mode, Git diff view, and 4 new autocomplete options. We have deprecated the classic IDE." +sidebar_label: "Enhancement and deprecation: New IDE features and classic IDE deprecation" +tags: + - Nov-2022 + - v1.1.67.0 + - IDE --- @@ -23,10 +26,12 @@ The new features are: - Use `env var` to autocomplete env var - **Dark mode** — Use dark mode in the dbt Cloud IDE for low-light environments. -Read more about all the [Cloud IDE features](/docs/get-started/dbt-cloud-features). +Read more about all the [Cloud IDE features](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#cloud-ide-features). ### Classic IDE deprecation notice -In December 2022, dbt Labs will deprecate the classic IDE. The [new and refreshed IDE](/docs/get-started/develop-in-the-cloud) will be available for _all_ dbt Cloud users. 
You will no longer be able to access the classic IDE and dbt Labs might introduce changes that break the classic IDE. +In December 2022, dbt Labs will deprecate the classic IDE. The [new and refreshed IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) will be available for _all_ dbt Cloud users. You will no longer be able to access the classic IDE and dbt Labs might introduce changes that break the classic IDE. With deprecation, dbt Labs will only support the refreshed version of the dbt Cloud IDE. + +Virtual Private Cloud (VPC) customers with questions about when this change will affect your account can contact your account team or support contact for assistance. diff --git a/website/docs/docs/dbt-versions/release-notes/05-Oct-2022/cloud-integration-azure.md b/website/docs/docs/dbt-versions/release-notes/25-Oct-2022/cloud-integration-azure.md similarity index 86% rename from website/docs/docs/dbt-versions/release-notes/05-Oct-2022/cloud-integration-azure.md rename to website/docs/docs/dbt-versions/release-notes/25-Oct-2022/cloud-integration-azure.md index 70170b0410f..513c2ef1258 100644 --- a/website/docs/docs/dbt-versions/release-notes/05-Oct-2022/cloud-integration-azure.md +++ b/website/docs/docs/dbt-versions/release-notes/25-Oct-2022/cloud-integration-azure.md @@ -3,10 +3,10 @@ title: "Announcing dbt Cloud’s native integration with Azure DevOps" id: "cloud-integration-azure" description: "dbt Cloud native integration with Azure DevOps" sidebar_label: "Improvement: Native integration with Azure DevOps" -tags: [Oct-11-2022, v1.1.64] +tags: [Oct-2022, v1.1.64] --- -dbt Cloud now offers a native integration with Azure DevOps for dbt Cloud customers on the enterprise plan. We built this integration to remove friction, increase security, and unlock net new product experiences for our customers. 
[Setting up the Azure DevOps integration](/docs/collaborate/git/connect-azure-devops) in dbt Cloud provides: +dbt Cloud now offers a native integration with Azure DevOps for dbt Cloud customers on the enterprise plan. We built this integration to remove friction, increase security, and unlock net new product experiences for our customers. [Setting up the Azure DevOps integration](/docs/cloud/git/connect-azure-devops) in dbt Cloud provides: - easy dbt project set up, - an improved security posture, - repo permissions enforcement in dbt Cloud IDE, and diff --git a/website/docs/docs/dbt-versions/release-notes/05-Oct-2022/new-ide-launch.md b/website/docs/docs/dbt-versions/release-notes/25-Oct-2022/new-ide-launch.md similarity index 67% rename from website/docs/docs/dbt-versions/release-notes/05-Oct-2022/new-ide-launch.md rename to website/docs/docs/dbt-versions/release-notes/25-Oct-2022/new-ide-launch.md index d57d051540c..aaf5f670e09 100644 --- a/website/docs/docs/dbt-versions/release-notes/05-Oct-2022/new-ide-launch.md +++ b/website/docs/docs/dbt-versions/release-notes/25-Oct-2022/new-ide-launch.md @@ -1,9 +1,9 @@ --- title: "Enhancement: New Cloud IDE launch" id: "new-ide-launch" -description: "Enhancement: New Cloud IDE launch" +description: "Oct 2022 release note: Introducing a new dbt Cloud IDE that's faster, has performance upgrades, ergonomics improvements, and other delightful enhancements." sidebar_label: "Snappier, faster, and new Cloud IDE" -tags: [Oct-18-2022] +tags: [Oct-2022, IDE] --- ## Introducing a snappier, improved, and powerful Cloud IDE @@ -18,4 +18,4 @@ Some of the improvements include: - Better organization and navigation with features like drag and drop of files, breadcrumb, build button drop-down, and more. - You can use new features like auto-format your file, auto-complete model names, and git diff view to see your changes before making a pull request. 
-Read more about the new [Cloud IDE features](/docs/get-started/dbt-cloud-features) and check out [New and improved Cloud IDE](https://www.getdbt.com/blog/new-improved-cloud-ide/) blog for more info! +Read more about the new [Cloud IDE features](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#cloud-ide-features) and check out [New and improved Cloud IDE](https://www.getdbt.com/blog/new-improved-cloud-ide/) blog for more info! diff --git a/website/docs/docs/dbt-versions/release-notes/26-Sept-2022/liststeps-endpoint-deprecation.md b/website/docs/docs/dbt-versions/release-notes/26-Sept-2022/liststeps-endpoint-deprecation.md new file mode 100644 index 00000000000..545847efd90 --- /dev/null +++ b/website/docs/docs/dbt-versions/release-notes/26-Sept-2022/liststeps-endpoint-deprecation.md @@ -0,0 +1,15 @@ +--- +title: "List Steps API endpoint deprecation warning" +id: "liststeps-endpoint-deprecation.md" +description: "List Steps API deprecation" +sidebar_label: "Deprecation: List Steps API endpoint" +tags: [Sept-2022] +--- + +On October 14th, 2022 dbt Labs is deprecating the [List Steps](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#tag/Runs/operation/listSteps) API endpoint. From October 14th, any GET requests to this endpoint will fail. Please prepare to stop using the List Steps endpoint as soon as possible. + +dbt Labs will continue to maintain the [Get Run](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#tag/Runs/operation/getRunById) endpoint, which is a viable alternative depending on the use case. 
+ +You can fetch run steps for an individual run with a GET request to the following URL, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan: + +`https://YOUR_ACCESS_URL/api/v2/accounts/{accountId}/runs/{runId}/?include_related=["run_steps"]` diff --git a/website/docs/docs/dbt-versions/release-notes/05-Sept-2022/metadata-api-data-retention-limits.md b/website/docs/docs/dbt-versions/release-notes/26-Sept-2022/metadata-api-data-retention-limits.md similarity index 83% rename from website/docs/docs/dbt-versions/release-notes/05-Sept-2022/metadata-api-data-retention-limits.md rename to website/docs/docs/dbt-versions/release-notes/26-Sept-2022/metadata-api-data-retention-limits.md index 20a6edb1c26..3ced6fecedd 100644 --- a/website/docs/docs/dbt-versions/release-notes/05-Sept-2022/metadata-api-data-retention-limits.md +++ b/website/docs/docs/dbt-versions/release-notes/26-Sept-2022/metadata-api-data-retention-limits.md @@ -3,9 +3,9 @@ title: "Query the previous three months of data using the metadata API" id: "metadata-api-data-retention-limits.md" description: "Metadata API data retention limits" sidebar_label: "Fix: Metadata API data retention limits" -tags: [Sept-29-2022] +tags: [Sept-2022] --- In order to make the metadata API more scalable and improve its latency, we’ve implemented data retention limits. The metadata API can now query data from the previous three months. For example, if today was March 1, you could query data back to January 1st. 
-For more information, see "[Metadata API](/docs/dbt-cloud-apis/metadata-api)" +For more information, see [Metadata API](/docs/dbt-cloud-apis/discovery-api) diff --git a/website/docs/docs/dbt-versions/release-notes/06-Aug-2022/ide-improvement-beta.md b/website/docs/docs/dbt-versions/release-notes/27-Aug-2022/ide-improvement-beta.md similarity index 93% rename from website/docs/docs/dbt-versions/release-notes/06-Aug-2022/ide-improvement-beta.md rename to website/docs/docs/dbt-versions/release-notes/27-Aug-2022/ide-improvement-beta.md index 9cb5827e9de..aa2c5ee4fa7 100644 --- a/website/docs/docs/dbt-versions/release-notes/06-Aug-2022/ide-improvement-beta.md +++ b/website/docs/docs/dbt-versions/release-notes/27-Aug-2022/ide-improvement-beta.md @@ -1,9 +1,9 @@ --- title: "Enhancement: New Cloud IDE beta" id: "ide-improvements-beta.md" -description: "Adding IDE performance and reliability improvements" +description: "Aug 2022 release note: Launch of the IDE beta, which focuses on performance and reliability improvements." 
sidebar_label: "Enhancement: New cloud IDE beta" -tags: [Aug-16-2022] +tags: [Aug-2022, IDE] --- :::info Beta feature diff --git a/website/docs/docs/dbt-versions/release-notes/06-Aug-2022/support-redshift-ra3.md b/website/docs/docs/dbt-versions/release-notes/27-Aug-2022/support-redshift-ra3.md similarity index 96% rename from website/docs/docs/dbt-versions/release-notes/06-Aug-2022/support-redshift-ra3.md rename to website/docs/docs/dbt-versions/release-notes/27-Aug-2022/support-redshift-ra3.md index b70783a2ed9..f48a8539923 100644 --- a/website/docs/docs/dbt-versions/release-notes/06-Aug-2022/support-redshift-ra3.md +++ b/website/docs/docs/dbt-versions/release-notes/27-Aug-2022/support-redshift-ra3.md @@ -3,7 +3,7 @@ title: "Enhancement: Support for cross-database sources on Redshift RA3 instance id: "support-redshift-ra3.md" description: "Adding support for cross-database queries for RA3" sidebar_label: "Enhancement: Support for RA3" -tags: [Aug-31-2022, 1.1.61.5] +tags: [Aug-2022, 1.1.61.5] --- diff --git a/website/docs/docs/dbt-versions/release-notes/07-July-2022/render-lineage-feature.md b/website/docs/docs/dbt-versions/release-notes/28-July-2022/render-lineage-feature.md similarity index 82% rename from website/docs/docs/dbt-versions/release-notes/07-July-2022/render-lineage-feature.md rename to website/docs/docs/dbt-versions/release-notes/28-July-2022/render-lineage-feature.md index cc2478ee11e..2d0488d4488 100644 --- a/website/docs/docs/dbt-versions/release-notes/07-July-2022/render-lineage-feature.md +++ b/website/docs/docs/dbt-versions/release-notes/28-July-2022/render-lineage-feature.md @@ -1,9 +1,9 @@ --- title: "Enhancement: Large DAG feature" id: "render-lineage-feature" -description: "Add a render button to visualize large DAGs" +description: "Jul 2022 release note: Use the Render Lineage button to visualize large DAGs" sidebar_label: "Enhancement: Large DAG feature" -tags: [July-5-2022, v1.1.56] +tags: [July-2022, v1.1.56, IDE] --- diff --git 
a/website/docs/docs/dbt-versions/release-notes/08-May-2022/gitlab-auth.md b/website/docs/docs/dbt-versions/release-notes/29-May-2022/gitlab-auth.md similarity index 96% rename from website/docs/docs/dbt-versions/release-notes/08-May-2022/gitlab-auth.md rename to website/docs/docs/dbt-versions/release-notes/29-May-2022/gitlab-auth.md index d468f557fa2..4e7c9ec3ea3 100644 --- a/website/docs/docs/dbt-versions/release-notes/08-May-2022/gitlab-auth.md +++ b/website/docs/docs/dbt-versions/release-notes/29-May-2022/gitlab-auth.md @@ -3,7 +3,7 @@ title: "Refresh expired access tokens in the IDE when using GitLab" id: "gitlab-auth" description: "Adding support for expiring OAuth access tokens." sidebar_label: "Enhancement: Refreshing GitLab OAuth Access" -tags: [May-19-2022, v1.1.52] +tags: [May-2022, v1.1.52] --- On May 22, GitLab changed how they treat [OAuth access tokens that don't expire](https://docs.gitlab.com/ee/update/deprecations.html#oauth-tokens-without-expiration). We updated our IDE logic to handle OAuth token expiration more gracefully. Now, the first time your token expires after 2 hours of consecutive IDE usage, you will have to re-authenticate in GitLab to refresh your expired OAuth access token. We will handle subsequent refreshes for you if you provide the authorization when you re-authenticate. 
diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2022/audit-log.md b/website/docs/docs/dbt-versions/release-notes/30-April-2022/audit-log.md similarity index 82% rename from website/docs/docs/dbt-versions/release-notes/09-April-2022/audit-log.md rename to website/docs/docs/dbt-versions/release-notes/30-April-2022/audit-log.md index cf16d033868..d0cc9208e1e 100644 --- a/website/docs/docs/dbt-versions/release-notes/09-April-2022/audit-log.md +++ b/website/docs/docs/dbt-versions/release-notes/30-April-2022/audit-log.md @@ -10,4 +10,4 @@ tags: [April-26-2022] To review actions performed by people in your organization, dbt provides logs of audited user and system events. The dbt Cloud audit log lists events triggered in your organization within the last 90 days. -The audit log includes details such as who performed the action, what the action was, and when it was performed. For more details, review [the audit log for dbt Cloud Enterprise](/docs/collaborate/manage-access/audit-log) documentation. +The audit log includes details such as who performed the action, what the action was, and when it was performed. For more details, review [the audit log for dbt Cloud Enterprise](/docs/cloud/manage-access/audit-log) documentation. 
diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2022/credentials-saved.md b/website/docs/docs/dbt-versions/release-notes/30-April-2022/credentials-saved.md similarity index 94% rename from website/docs/docs/dbt-versions/release-notes/09-April-2022/credentials-saved.md rename to website/docs/docs/dbt-versions/release-notes/30-April-2022/credentials-saved.md index 971c83a4a2a..388f133e861 100644 --- a/website/docs/docs/dbt-versions/release-notes/09-April-2022/credentials-saved.md +++ b/website/docs/docs/dbt-versions/release-notes/30-April-2022/credentials-saved.md @@ -3,7 +3,7 @@ title: "Credentials no longer accidentally wiped when editing an environment" id: "credentials-saved" description: "Credentials are now saved when editing an environment." sidebar_label: "Fix: Credentials saved" -tags: [April-29-2022 v1.1.51] +tags: [April-29-2022, v1.1.51] --- We resolved a bug where when updating unencrypted fields (e.g. threads, schema name) in an environment setting would cause secret fields (e.g. password, keypair, credential details) to be deleted from that environment. Now users can freely update environment settings without fear of unintentionally wiping credentials. 
diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2022/email-verification.md b/website/docs/docs/dbt-versions/release-notes/30-April-2022/email-verification.md similarity index 100% rename from website/docs/docs/dbt-versions/release-notes/09-April-2022/email-verification.md rename to website/docs/docs/dbt-versions/release-notes/30-April-2022/email-verification.md diff --git a/website/docs/docs/dbt-versions/release-notes/09-April-2022/scheduler-improvements.md b/website/docs/docs/dbt-versions/release-notes/30-April-2022/scheduler-improvements.md similarity index 100% rename from website/docs/docs/dbt-versions/release-notes/09-April-2022/scheduler-improvements.md rename to website/docs/docs/dbt-versions/release-notes/30-April-2022/scheduler-improvements.md diff --git a/website/docs/docs/dbt-versions/release-notes/10-March-2022/ide-timeout-message.md b/website/docs/docs/dbt-versions/release-notes/31-March-2022/ide-timeout-message.md similarity index 73% rename from website/docs/docs/dbt-versions/release-notes/10-March-2022/ide-timeout-message.md rename to website/docs/docs/dbt-versions/release-notes/31-March-2022/ide-timeout-message.md index 2f1fa321753..307786c6b85 100644 --- a/website/docs/docs/dbt-versions/release-notes/10-March-2022/ide-timeout-message.md +++ b/website/docs/docs/dbt-versions/release-notes/31-March-2022/ide-timeout-message.md @@ -1,9 +1,9 @@ --- title: "Spotty internet issues no longer cause a session time out message" id: "ide-timeout-message" -description: "We fixed an issue where a spotty internet connection could cause the “IDE session timed out” message to appear unexpectedly. People using a VPN were most likely to see this issue." +description: "Mar 2022 release note: We fixed an issue where a spotty internet connection could cause the “IDE session timed out” message to appear unexpectedly. People using a VPN were most likely to see this issue." 
sidebar_label: "Fix: Session time out" -tags: [v1.1.47, March-10-2022] +tags: [v1.1.47, March-10-2022, IDE] --- We fixed an issue where a spotty internet connection could cause the “IDE session timed out” message to appear unexpectedly. People using a VPN were most likely to see this issue. diff --git a/website/docs/docs/dbt-versions/release-notes/10-March-2022/prep-and-waiting-time.md b/website/docs/docs/dbt-versions/release-notes/31-March-2022/prep-and-waiting-time.md similarity index 100% rename from website/docs/docs/dbt-versions/release-notes/10-March-2022/prep-and-waiting-time.md rename to website/docs/docs/dbt-versions/release-notes/31-March-2022/prep-and-waiting-time.md diff --git a/website/docs/docs/dbt-versions/release-notes/11-February-2022/DAG-updates-more.md b/website/docs/docs/dbt-versions/release-notes/32-February-2022/DAG-updates-more.md similarity index 78% rename from website/docs/docs/dbt-versions/release-notes/11-February-2022/DAG-updates-more.md rename to website/docs/docs/dbt-versions/release-notes/32-February-2022/DAG-updates-more.md index 57ef3b1a65a..6557d080394 100644 --- a/website/docs/docs/dbt-versions/release-notes/11-February-2022/DAG-updates-more.md +++ b/website/docs/docs/dbt-versions/release-notes/32-February-2022/DAG-updates-more.md @@ -1,9 +1,9 @@ --- title: "DAG updates and performance improvements" -description: "Clicking a node in the DAG opens a model or config file in new tab in the IDE." +description: "Feb 2022 release note: Clicking a node in the DAG opens a model or config file in new tab in the IDE." id: "DAG-updates-more" sidebar_label: "DAG updates and more" -tags: [v1.1.44, February-02-2022] +tags: [v1.1.44, February-02-2022, IDE] --- Love the DAG in the IDE as much as we do? Now when you click on a node in the DAG, the model or config file will open as a new tab in the IDE, so you can directly view or edit the code. We'll continue to ship better developer ergonomic functionality throughout the year. 
diff --git a/website/docs/docs/dbt-versions/release-notes/11-February-2022/service-tokens-more.md b/website/docs/docs/dbt-versions/release-notes/32-February-2022/service-tokens-more.md similarity index 100% rename from website/docs/docs/dbt-versions/release-notes/11-February-2022/service-tokens-more.md rename to website/docs/docs/dbt-versions/release-notes/32-February-2022/service-tokens-more.md diff --git a/website/docs/docs/dbt-versions/release-notes/12-January-2022/IDE-autocomplete-more.md b/website/docs/docs/dbt-versions/release-notes/33-January-2022/IDE-autocomplete-more.md similarity index 75% rename from website/docs/docs/dbt-versions/release-notes/12-January-2022/IDE-autocomplete-more.md rename to website/docs/docs/dbt-versions/release-notes/33-January-2022/IDE-autocomplete-more.md index f45fddddfd5..c29ed790e8e 100644 --- a/website/docs/docs/dbt-versions/release-notes/12-January-2022/IDE-autocomplete-more.md +++ b/website/docs/docs/dbt-versions/release-notes/33-January-2022/IDE-autocomplete-more.md @@ -1,12 +1,12 @@ --- title: "Autocomplete snippets for SQL and YAML files in IDE" id: "ide-timeout-message" -description: "Autocomplete snippets for SQL and YAML files in IDE" +description: "Jan 2022 release note: Autocomplete snippets for SQL and YAML files in IDE" sidebar_label: "Autocomplete in IDE and more" -tags: [v1.1.43, January-19-2022] +tags: [v1.1.43, January-19-2022, IDE] --- -Some noteworthy improvements include autocomplete snippets for sql and yaml files in the IDE, which are available for use now! We also added a [new metric layer page](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-metrics-layer) to docs.getdbt.com to help you begin thinking about the metrics layer in dbt Cloud. +Some noteworthy improvements include autocomplete snippets for SQL and YAML files in the IDE, which are available for use now!
We also added a [new metric layer page](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-metrics-layer) to docs.getdbt.com to help you begin thinking about the metrics layer in dbt Cloud. #### Performance improvements and enhancements diff --git a/website/docs/docs/dbt-versions/release-notes/12-January-2022/model-timing-more.md b/website/docs/docs/dbt-versions/release-notes/33-January-2022/model-timing-more.md similarity index 100% rename from website/docs/docs/dbt-versions/release-notes/12-January-2022/model-timing-more.md rename to website/docs/docs/dbt-versions/release-notes/33-January-2022/model-timing-more.md diff --git a/website/docs/docs/dbt-versions/release-notes/14-dbt-cloud-changelog-2021.md b/website/docs/docs/dbt-versions/release-notes/34-dbt-cloud-changelog-2021.md similarity index 99% rename from website/docs/docs/dbt-versions/release-notes/14-dbt-cloud-changelog-2021.md rename to website/docs/docs/dbt-versions/release-notes/34-dbt-cloud-changelog-2021.md index cf350b78778..996229807a1 100644 --- a/website/docs/docs/dbt-versions/release-notes/14-dbt-cloud-changelog-2021.md +++ b/website/docs/docs/dbt-versions/release-notes/34-dbt-cloud-changelog-2021.md @@ -4,7 +4,7 @@ id: "dbt-cloud-changelog-2021" sidebar_label: Changelog (2021) description: "2021 Changelog for the dbt Cloud application" -tags: [v1.1.41, v1.1.40, v1.1.39, v1.1.38, v1.1.37, v1.1.36, v1.1.35, v1.1.34, v1.1.33, v1.1.32, v1.1.31, v1.1.30, v1.1.29, v1.1.28, v1.1.27, v1.1.26, v1.1.25, v1.1.24, v1.1.23, v1.1.22, v1.1.21, v1.1.20, v1.1.19, v1.1.18] +tags: [v1.1.41, v1.1.40, v1.1.39, v1.1.38, v1.1.37, v1.1.36, v1.1.35, v1.1.34, v1.1.33, v1.1.32, v1.1.31, v1.1.30, v1.1.29, v1.1.28, v1.1.27, v1.1.26, v1.1.25, v1.1.24, v1.1.23, v1.1.22, v1.1.21, v1.1.20, v1.1.19, v1.1.18, Jan-1-2021] --- Welcome to the 2021 changelog for the dbt Cloud application! You can use this changelog to see highlights of what was new, fixed, and enhanced. 
@@ -103,8 +103,8 @@ We’ve improved the tabbing experience in the IDE. Tabs now work much more intu #### Performance improvements and enhancements - We've been working on some nice improvements to tabs in our IDE. We’ve fixed deficiencies with tabs that caused users to lose work if they didn’t hit save regularly enough. Additionally, opening, closing, and the order of the tabs work much more smoothly. -- You may have noticed that there is now a source freshness checkbox in your execution settings when you configure a job on dbt Cloud. Selecting this checkbox will run `dbt source freshness` as the first step in your job, but it will not break subsequent steps if it fails. Updated source freshness documentation available [here](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-snapshotting-source-freshness). -- Added a new endpoint to allow API key rotation via `POST https://cloud.getdbt.com/api/v2/users/{user-id}/apiKey` +- You may have noticed that there is now a source freshness checkbox in your execution settings when you configure a job on dbt Cloud. Selecting this checkbox will run `dbt source freshness` as the first step in your job, but it will not break subsequent steps if it fails. Updated source freshness documentation available [here](/docs/deploy/source-freshness). +- Added a new endpoint to allow API key rotation via `POST https://cloud.getdbt.com/api/v2/users/{user-id}/apikey` ## dbt Cloud v1.1.30 (July 7, 2021) @@ -139,7 +139,7 @@ We shipped a far better experience for GitLab users. Be sure to check out new CI #### New products and features -- `Slim CI`: We’ve made Slim CI available for all our cloud customers! With Slim CI, you don't have to rebuild and test all your models; you can instruct dbt Cloud to run jobs on only modified or new resources. If you are a GitHub or GitLab user, try creating a new job that runs on pull requests and you can signal to dbt to run only on these modified resources by including the `state:modified+` argument. 
Read more about Slim CI [here](/docs/deploy/cloud-ci-job). +- `Slim CI`: We’ve made Slim CI available for all our cloud customers! With Slim CI, you don't have to rebuild and test all your models; you can instruct dbt Cloud to run jobs on only modified or new resources. If you are a GitHub or GitLab user, try creating a new job that runs on pull requests and you can signal to dbt to run only on these modified resources by including the `state:modified+` argument. Read more about Slim CI [here](/docs/deploy/continuous-integration). - Native GitLab authentication for dbt Cloud Developer and Team Tiers: We’ve shipped native GitLab auth into GA. You can now import new GitLab repos with a couple clicks, trigger CI builds when Merge Requests are opened in GitLab, and carry GitLab permissions through to dbt Cloud IDE's git actions. Read how to set up native GitLab auth [here](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-gitlab). diff --git a/website/docs/docs/dbt-versions/release-notes/13-dbt-cloud-changelog-2019-2020.md b/website/docs/docs/dbt-versions/release-notes/35-dbt-cloud-changelog-2019-2020.md similarity index 99% rename from website/docs/docs/dbt-versions/release-notes/13-dbt-cloud-changelog-2019-2020.md rename to website/docs/docs/dbt-versions/release-notes/35-dbt-cloud-changelog-2019-2020.md index 7da7dad0a98..b8e15b993de 100644 --- a/website/docs/docs/dbt-versions/release-notes/13-dbt-cloud-changelog-2019-2020.md +++ b/website/docs/docs/dbt-versions/release-notes/35-dbt-cloud-changelog-2019-2020.md @@ -4,7 +4,7 @@ id: "dbt-cloud-changelog-2019-2020" sidebar_label: Changelog (2019 and 2020) description: "2019 and 2020 Changelog for the dbt Cloud application" -tags: [v1.1.16, v1.1.15, v1.1.14, v1.1.13, v1.1.12, v1.1.11, v1.1.10, v1.1.09, v1.1.08, v1.1.07, v1.1.06, v1.1.05, v1.1.04, v1.1.03, v1.1.02, v1.1.01, v0.5.0] +tags: [v1.1.16, v1.1.15, v1.1.14, v1.1.13, v1.1.12, v1.1.11, v1.1.10, v1.1.09, v1.1.08, v1.1.07, v1.1.06, 
v1.1.05, v1.1.04, v1.1.03, v1.1.02, v1.1.01, v0.5.0, Jan-1-2020] --- Welcome to the 2019 and 2020 changelog for the dbt Cloud application! You can use this changelog to see the highlights of what was new, fixed, and enhanced during this time period. @@ -218,7 +218,7 @@ This release includes security enhancements and improvements across the entire dbt Cloud application. #### Enhancements -- Support for viewing development docs inside of the IDE ([docs](viewing-docs-in-the-ide)) +- Support for viewing development docs inside of the IDE ([docs](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud)) - Change CI temporary schema names to be prefixed with `dbt_cloud` instead of `sinter` - Change coloring and iconography to improve accessibility and UX across the application - [Enterprise] Support the specification of multiple authorized domains in SSO configuration diff --git a/website/docs/docs/dbt-versions/upgrade-core-in-cloud.md b/website/docs/docs/dbt-versions/upgrade-core-in-cloud.md index f0b99499891..d143aab5ef1 100644 --- a/website/docs/docs/dbt-versions/upgrade-core-in-cloud.md +++ b/website/docs/docs/dbt-versions/upgrade-core-in-cloud.md @@ -3,8 +3,6 @@ title: "Upgrade Core version in Cloud" id: "upgrade-core-in-cloud" --- -## Upgrading to the latest version of dbt in Cloud - In dbt Cloud, both jobs and environments are configured to use a specific version of dbt Core. The version can be upgraded at any time. ### Environments @@ -23,24 +21,21 @@ Each job in dbt Cloud can be configured to inherit parameters from the environme The example job seen in the screenshot above belongs to the environment "Prod". It inherits the dbt version of its environment as shown by the **Inherited from ENVIRONMENT_NAME (DBT_VERSION)** selection. You may also manually override the dbt version of a specific job to be any of the current Core releases supported by Cloud by selecting another option from the dropdown.
-## Supported Versions - -We have always encouraged our customers to upgrade dbt Core versions whenever a new minor version is released. We released our first major version of dbt - `dbt 1.0` - in December 2021. Alongside this release, we updated our policy on which versions of dbt Core we will support in dbt Cloud. - +## Supported versions +dbt Labs has always encouraged users to upgrade dbt Core versions whenever a new minor version is released. We released our first major version of dbt - `dbt 1.0` - in December 2021. Alongside this release, we updated our policy on which versions of dbt Core we will support in dbt Cloud. - > **Starting with v1.0, any subsequent minor versions will be supported in dbt Cloud for 1 year post release. At the end of the 1 year window, accounts must upgrade to a supported version of dbt or risk service disruption.** +> **Starting with v1.0, all subsequent minor versions are available in dbt Cloud. Versions are actively supported, with patches and bug fixes, for 1 year after their initial release. At the end of the 1-year window, we encourage all users to upgrade to a newer version for better ongoing maintenance and support.** -We will continue to update this table so that customers know when we plan to stop supporting different versions of Core in dbt Cloud. +We provide different support levels for different versions, which may include new features, bug fixes, or security patches: - + +We'll continue to update the following release table so that users know when we plan to stop supporting different versions of Core in dbt Cloud. -:::warning ⚠️ v0.X Non-Supported Period - Accounts had until the end of June 2022 to upgrade to dbt 1.0 or later. Pre-dbt 1.0 versions will no longer receive patch fixes, and our support team will no longer assist with dbt version specific help on non-supported versions of dbt. 
Additionally, jobs running dbt versions prior to 1.0 may experience service disruptions before the end of the year and may be removed from the dbt Cloud context by year end. You will receive additional notification before any planned disruption to your production jobs. -::: + -Starting in v1.0, dbt Cloud will ensure that you're always using the latest compatible patch release of `dbt-core` and plugins, including all the latest fixes. You may also choose to try prereleases of those patch releases before they are generally available. +Starting with v1.0, dbt Cloud will ensure that you're always using the latest compatible patch release of `dbt-core` and plugins, including all the latest fixes. You may also choose to try prereleases of those patch releases before they are generally available. -For more on version support and future releases, see [Understanding dbt Core versions](core-versions). - -#### What will actually happen on the end of support date? - -1 year post a minor version release of v1.X, we will try to run our users' projects on the latest release of dbt if they have not already upgraded their projects themselves. In a post dbt v1.0 world, there won't be breaking changes between minor versions of dbt, so we might be reasonably successful at upgrading our users' versions for them. However, our strong preference is for accounts to try to manage the upgrade process themselves which is a more cautious way to prevent failures to their production pipelines. We will give accounts consistent communication that they're hitting the end of their supported window, so they can plan accordingly. - -#### What should you be doing today? - -You should **upgrade to v1.0 as soon as you can** - and we recommend that you proceed **slowly and steadily**. - -Why? Because attempting to upgrade 6 minor versions at one time (v0.15.0 —> v0.21.0) implies 6x the potential for breaking changes, versus upgrading a single minor version. 
- -Refactoring code is much easier when you're updating a well-defined, constrained surface area. Doing things incrementally is the way to go. - -Additionally upgrading to more recent versions of dbt Core will enable better performance and more features in dbt Cloud. Below is a compatability matrix between dbt versions and dbt Cloud features. Hopefully this provides more motivation to always update your environments and jobs to run the latest version of dbt. - -| dbt Cloud Feature | dbt Core Version Needed | -| ------------- | -------------- | -| [Environment variable secret scrubbing](/docs/build/environment-variables#handling-secrets)| v1.0+ | -| DAG in the IDE | v0.20.0+| -| [Metadata API](/docs/dbt-cloud-apis/metadata-api) |v0.19.0+| -| [Dashboard status tiles](/docs/deploy/dashboard-status-tiles) | v0.19.0+ | -| [Slim CI](/docs/deploy/cloud-ci-job) | v0.18.0+ | +For more on version support and future releases, see [Understanding dbt Core versions](/docs/dbt-versions/core). #### Need help upgrading? @@ -78,9 +51,9 @@ If you want more advice on how to upgrade your dbt projects, check out our [migr ## Upgrading legacy versions under 1.0 -This Q&A guide should help you figure out what changes you might need to make to successfully upgrade your version of dbt Core in dbt Cloud. As a reminder, we recommend everyone upgrade to the most recent version of dbt, as we will not support all versions of dbt in Cloud indefinitely. We document which versions of dbt Core we support [here](docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version#supported-versions). +You can use the following sections to successfully upgrade your version of dbt Core in dbt Cloud. We recommend everyone upgrade to the most recent version of dbt Core, as new versions contain enhancements, bug fixes, and updated security features. We document which [versions of dbt Core are currently supported](/docs/dbt-versions/upgrade-core-in-cloud#supported-versions). 
-There aren't many breaking changes between minor versions, and it may be the case that you don't need to change any code to upgrade to a newer version of dbt in dbt Cloud. There are only breaking changes between minor versions of dbt before dbt 1.0. Minor releases starting with dbt 1.0, do not have breaking code changes. If there are no code changes needed, all you have to do is [change the settings](/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version.md) in your environment or job to run a more recent version of dbt. +There aren't many breaking changes between minor versions, and it may be the case that you don't need to change any code to upgrade to a newer version of dbt in dbt Cloud. There are only breaking changes between minor versions of dbt before dbt 1.0. Minor releases starting with dbt 1.0 do not have breaking code changes. If there are no code changes needed, all you have to do is [change the settings](/docs/dbt-versions/upgrade-core-in-cloud#upgrading-to-the-latest-version-of-dbt-in-cloud) in your environment or job to run a more recent version of dbt. #### Changes between minor versions of dbt that will affect your project @@ -302,7 +275,7 @@ If you believe your project might be affected, read more details in the migratio #### Testing your changes before upgrading -Once you have an idea about what code changes you'll need to make, you can start implementing them. We recommend that you create a separate dbt project, **Upgrade Project**, to test your changes before making them live in your main dbt project. In your **Upgrade Project**, connect to the same repository that you use for your main dbt project, but this time, set the development environment [settings](docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version) to run the latest version of dbt Core.
Next check out a branch `dbt-version-upgrade`, make the appropriate updates to your project (if needed), and see if your dbt project compiles and runs with the new version of dbt in the IDE. If jumping directly to the latest version of dbt is too far of a leap for your project, try iteratively getting your project to work on each successive minor version. There are years of development and a handful of breaking changes between two distant versions of dbt (e.g. 0.14 --> 1.0). There are far fewer between two subsequent versions of dbt, which is why upgrading regularly is important. +Once you know what code changes you'll need to make, you can start implementing them. We recommend you create a separate dbt project, **Upgrade Project**, to test your changes before making them live in your main dbt project. In your **Upgrade Project**, connect to the same repository you use for your production project. This time, set the development environment [settings](/docs/dbt-versions/upgrade-core-in-cloud) to run the latest version of dbt Core. Next, check out a branch `dbt-version-upgrade`, make the appropriate updates to your project, and verify your dbt project compiles and runs with the new version in the IDE. If upgrading directly to the latest version results in too many issues, try testing your project iteratively on successive minor versions. There are years of development and a few breaking changes between distant versions of dbt Core (for example, 0.14 --> 1.0). The likelihood of experiencing problems upgrading between successive minor versions is much lower, which is why upgrading regularly is recommended. Once you have your project compiling and running on the latest version of dbt in the development environment for your `dbt-version-upgrade` branch, try replicating one of your production jobs to run off your branch's code. 
You can do this by creating a new deployment environment for testing, setting the custom branch to 'ON' and referencing your `dbt-version-upgrade` branch. You'll also need to set the dbt version in this environment to the latest dbt Core version. diff --git a/website/docs/docs/deploy/airgapped.md b/website/docs/docs/deploy/airgapped.md index e04154e2c61..a08370fef8c 100644 --- a/website/docs/docs/deploy/airgapped.md +++ b/website/docs/docs/deploy/airgapped.md @@ -10,10 +10,10 @@ If you’re interested in learning more about airgapped deployments for dbt Clou ::: -The airgapped deployment is similiar to an on-premise installation in that the dbt Cloud instance will live in your network, and is subject to your security procedures, technologies, and controls. However, on-premises still has a number of [external network dependencies](/docs/dbt-cloud/on-premises/dependencies). An airgapped install allows you to run dbt Cloud without any of those external network dependencies and is ideal for organizations who have strict InfoSec rules around installing software from the cloud. +The airgapped deployment is similar to an on-premise installation in that the dbt Cloud instance will live in your network, and is subject to your security procedures, technologies, and controls. An airgapped install allows you to run dbt Cloud without any external network dependencies and is ideal for organizations that have strict rules around installing software from the cloud. The installation process for airgapped is a bit different. Instead of downloading and installing images during installation time, you will download all of the necessary configuration and Docker images before starting the installation process, and manage uploading these images yourself. This means that you can remove all external network dependencies and run this application in a very secure environment. For more information about the dbt Cloud Airgapped deployment see the below. 
-- [Customer Managed Network Architecture](/docs/dbt-cloud/deployments/deployment-architecture#customer-managed-general-network-architecture) +- [Customer Managed Network Architecture](/docs/cloud/about-cloud/architecture) diff --git a/website/docs/docs/deploy/architecture.md b/website/docs/docs/deploy/architecture.md deleted file mode 100644 index 2db65c53ea2..00000000000 --- a/website/docs/docs/deploy/architecture.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: Architecture -id: architecture ---- - -This page is intended to help both practitioners seeking to understand the architecture and data flow of the hosted dbt Cloud product. - -### Application Data Flows - -The dbt Cloud application is comprised of a set of static components, as well as a set of dynamic components. The static components are constantly running to serve highly available dbt Cloud functionality, for example, the dbt Cloud web application. The dynamic components are created just-in-time to fill background jobs or a user request to use the IDE. These components are enumerated below. - - - -#### Static Application Components - -- **api gateway**: The API gateway is the entrypoint for all client requests to dbt Cloud. The api gateway serves static content, and contains logic for routing requests within the dbt Cloud application. -- **app**: The app is the dbt Cloud application server. It consists of a Django application capable of serving dbt Cloud REST API requests. -- **scheduler**: The scheduler is a continuously running process that orchestrates background jobs in dbt Cloud. It consists of two components: the scheduler container which provisions dynamic resources just-in-time, and the background cleanup container which performs maintenance tasks on the dbt Cloud database, including flushing logs from dbt runs out into the object store. 
- -#### Dynamic Application Components - -- **dbt run**: A "run" in dbt Cloud represents a series of background invocations of dbt that are triggered either on a cron scheduler, manually by a user, or via dbt Cloud's API. -- **dbt develop**: This is a server capable of serving dbt IDE requests for a single user. dbt Cloud will create one of these for each user that is actively using the dbt IDE. - -#### Application Critical Components - -In addition to the application components, there are a few critical dependencies of the application components that are required in order for the dbt Cloud application to function. - -- **PostgreSQL database**: dbt Cloud uses a PostgreSQL database as its backend. This can be a cloud-hosted database, for example, AWS RDS, Azure Database, Google Cloud SQL (recommended for production deployments); or, it can be embedded into the dbt Cloud Kubernetes appliance (not recommended for production deployments). -- **Object Storage**: dbt Cloud requires an S3-compatible Object Storage system for persisting run logs and artifacts. -- **Storage Volumes**: dbt Cloud requires a Kubernetes storage provider capable of creating dynamic persistent volumes that can be mounted to multiple containers in R/W mode. - -### Data Warehouse Interaction - -dbt Cloud's primary role is as a data processor, not a data store. The dbt Cloud application enables users to dispatch SQL to the warehouse for transformation purposes. However, it is possible for users to dispatch SQL that returns customer data into the dbt Cloud application. This data is never persisted and will only exist in memory on the instance in question. In order to properly lock down customer data, it is critical that proper permissioning is applied to prevent improper access or storage of sensitive data. - -### Deployment Architecture - -The following two sections describe the network architectures for dbt Cloud deployments. Hosted deployments leverage AWS infrastructure. 
- -#### Hosted Network Architecture - -The following diagram shows the network architecture for the hosted _Multi Tenant_ and _Single Tenant_ deployment types. While many of the specifications differ between the Multi Tenant and Single Tenant offerings the basic types of components illustrated below are mostly the same. Read below for more information on each of the components and how they might differ between the two deployment models. - - - -- **VPC**: In both hosted deployments, the dbt Cloud application infrastructure lives in an [AWS VPC](https://aws.amazon.com/vpc/) managed by dbt Labs. One of the key differences between the Production and Single Tenant deployment is that the Single Tenant deployment provides a dedicated VPC for a single customer. -- **EKS**: Hosted environments leverage [AWS Elastic Kubernetes Service](https://aws.amazon.com/eks/) to manage dbt Cloud application resources. EKS provides a high degree of reliability and scalability for the dbt Cloud application. -- **CLB**: One or more [AWS Classic Load Balancers](https://aws.amazon.com/elasticloadbalancing/) living in a public subnet are leveraged in the hosted deployment environments to distribute incoming traffic across multple EC2 instances in the EKS cluster. -- **EC2**: The hosted dbt Cloud deployments leverage a cluster of [AWS EC2](https://aws.amazon.com/ec2/) worker nodes to run the dbt Cloud application. -- **EBS**: In order to store application data, dbt Cloud leverages [AWS Elastic Block Store](https://aws.amazon.com/ebs/) mounted to the EC2 instances described above. -- **EFS**: An [AWS Elastic File System](https://aws.amazon.com/efs/) is provisioned for hosted deployments to store and manage local files from the dbt Cloud IDE. -- **S3**: [AWS Simple Storage Service (S3)](https://aws.amazon.com/s3/) is used to store dbt Cloud application logs and artifacts (such as those generated from dbt job runs). 
-- **RDS**: The hosted dbt Cloud application leverages [AWS Postgres RDS](https://aws.amazon.com/rds/postgresql/) to store application information such as accounts, users, environments, etc. Note that as explained in the [Data Warehouse Interaction](#data-warehouse-interaction) section above, no data from an associated warehouse is ever stored in this database. diff --git a/website/docs/docs/dbt-cloud/using-dbt-cloud/artifacts.md b/website/docs/docs/deploy/artifacts.md similarity index 80% rename from website/docs/docs/dbt-cloud/using-dbt-cloud/artifacts.md rename to website/docs/docs/deploy/artifacts.md index 40d5a2560ff..9b3ae71e79c 100644 --- a/website/docs/docs/dbt-cloud/using-dbt-cloud/artifacts.md +++ b/website/docs/docs/deploy/artifacts.md @@ -1,10 +1,10 @@ --- -title: "Building and configuring artifacts" +title: "Artifacts" id: "artifacts" description: "Use artifacts to power your automated docs site and source freshness data." --- -When running dbt jobs, dbt Cloud generates and saves *artifacts*. You can use these artifacts, like `manifest.json`, `catalog.json`, and `sources.json` to power different aspects of dbt Cloud, namely: [dbt Docs](documentation) and [source freshness reporting](/docs/build/sources#snapshotting-source-data-freshness). +When running dbt jobs, dbt Cloud generates and saves *artifacts*. You can use these artifacts, like `manifest.json`, `catalog.json`, and `sources.json` to power different aspects of dbt Cloud, namely: [dbt Docs](/docs/collaborate/documentation) and [source freshness reporting](/docs/build/sources#snapshotting-source-data-freshness). ## Create dbt Cloud Artifacts @@ -20,7 +20,10 @@ When you add a production job to a project, dbt Cloud updates the content and pr ### Documentation -When set up, dbt Cloud updates the **Documentation** link in the upper left so it links to documentation for this job. This link always points to the latest version of the documentation for your account! 
+When set up, dbt Cloud updates the **Documentation** link in the header tab so it links to documentation for this job. This link always directs you to the latest version of the documentation for your project. + +Note that both the job's commands and the docs generate step (triggered by the **Generate docs on run** checkbox) must succeed during the job invocation for the project-level documentation to be populated or updated. + diff --git a/website/docs/docs/deploy/ci-jobs.md b/website/docs/docs/deploy/ci-jobs.md new file mode 100644 index 00000000000..d10bc780fc2 --- /dev/null +++ b/website/docs/docs/deploy/ci-jobs.md @@ -0,0 +1,164 @@ +--- +title: "Continuous integration jobs in dbt Cloud" +sidebar_label: "CI jobs" +description: "Learn how to create and set up CI checks to test code changes before deploying to production." +--- + +You can set up [continuous integration](/docs/deploy/continuous-integration) (CI) jobs to run when someone opens a new pull request (PR) in your dbt Git repository. By running and testing only _modified_ models, dbt Cloud ensures these jobs are as efficient and resource conscientious as possible on your data platform. + + +## Set up CI jobs {#set-up-ci-jobs} + +dbt Labs recommends that you create your CI job in a dedicated dbt Cloud [deployment environment](/docs/deploy/deploy-environments#create-a-deployment-environment) that's connected to a staging database. Having a separate environment dedicated for CI will provide better isolation between your temporary CI schema builds and your production data builds. Additionally, sometimes teams need their CI jobs to be triggered when a PR is made to a branch other than main. If your team maintains a staging branch as part of your release process, having a separate environment will allow you to set a [custom branch](/faqs/environments/custom-branch-settings) and, accordingly, the CI job in that dedicated environment will be triggered only when PRs are made to the specified custom branch. 
To learn more, refer to [Get started with CI tests](/guides/orchestration/set-up-ci/overview). + +### Prerequisites +- You have a dbt Cloud account. +- For the [Concurrent CI checks](/docs/deploy/continuous-integration#concurrent-ci-checks) and [Smart cancellation of stale builds](/docs/deploy/continuous-integration#smart-cancellation) features, your dbt Cloud account must be on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). +- You must be connected using dbt Cloud’s native Git integration with [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), or [Azure DevOps](/docs/cloud/git/connect-azure-devops). + - If you’re using GitLab, you must use a paid or self-hosted account which includes support for GitLab webhooks. + - If you previously configured your dbt project by providing a generic git URL that clones using SSH, you must reconfigure the project to connect through dbt Cloud's native integration. + + +To make CI job creation easier, many options on the **CI job** page are set to default values that dbt Labs recommends that you use. If you don't want to use the defaults, you can change them. + +1. On your deployment environment page, click **Create Job** > **Continuous Integration Job** to create a new CI job. + +2. Options in the **Job Description** section: + - **Job Name** — Specify the name for this CI job. + - **Environment** — By default, it’s set to the environment you created the CI job from. + - **Triggered by pull requests** — By default, it’s enabled. Every time a developer opens up a pull request or pushes a commit to an existing pull request, this job will get triggered to run. + - **Run on Draft Pull Request** — Enable this option if you want to also trigger the job to run every time a developer opens up a draft pull request or pushes a commit to that draft pull request. + +3. 
Options in the **Execution Settings** section: + - **Commands** — By default, it includes the `dbt build --select state:modified+` command. This informs dbt Cloud to build only new or changed models and their downstream dependents. Importantly, state comparison can only happen when there is a deferred environment selected to compare state to. Click **Add command** to add more [commands](/docs/deploy/job-commands) that you want to be invoked when this job runs. + - **Compare changes against an environment (Deferral)** — By default, it’s set to the **Production** environment if you created one. This option allows dbt Cloud to check the state of the code in the PR against the code running in the deferred environment, so as to only check the modified code, instead of building the full table or the entire DAG. + + :::info + Older versions of dbt Cloud only allow you to defer to a specific job instead of an environment. Deferral to a job compares state against the project code that was run in the deferred job's last successful run. While deferral to an environment is more efficient as dbt Cloud will compare against the project representation (which is stored in the `manifest.json`) of the last successful deploy job run that executed in the deferred environment. By considering _all_ [deploy jobs](/docs/deploy/deploy-jobs) that run in the deferred environment, dbt Cloud will get a more accurate, latest project representation state. + ::: + + - **Generate docs on run** — Enable this option if you want to [generate project docs](/docs/collaborate/build-and-view-your-docs) when this job runs. This option is disabled by default since most teams do not want to test doc generation on every CI check. + + + +4. (optional) Options in the **Advanced Settings** section: + - **Environment Variables** — Define [environment variables](/docs/build/environment-variables) to customize the behavior of your project when this CI job runs. 
You can specify that a CI job is running in a _Staging_ or _CI_ environment by setting an environment variable and modifying your project code to behave differently, depending on the context. It's common for teams to process only a subset of data for CI runs, using environment variables to branch logic in their dbt project code. + - **Target Name** — Define the [target name](/docs/build/custom-target-names). Similar to **Environment Variables**, this option lets you customize the behavior of the project. You can use this option to specify that a CI job is running in a _Staging_ or _CI_ environment by setting the target name and modifying your project code to behave differently, depending on the context. + - **Run Timeout** — Cancel this CI job if the run time exceeds the timeout value. You can use this option to help ensure that a CI check doesn't consume too much of your warehouse resources. + - **dbt Version** — By default, it’s set to inherit the [dbt version](/docs/dbt-versions/core) from the environment. dbt Labs strongly recommends that you don't change the default setting. This option to change the version at the job level is useful only when you upgrade a project to the next dbt version; otherwise, mismatched versions between the environment and job can lead to confusing behavior. + - **Threads** — By default, it’s set to 4 [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads). Increase the thread count to increase model execution concurrency. + - **Run source freshness** — Enable this option to invoke the `dbt source freshness` command before running this CI job. Refer to [Source freshness](/docs/deploy/source-freshness) for more details. 
+ + + + +## Trigger a CI job with the API + +If you're not using dbt Cloud’s native Git integration with [GitHub](/docs/cloud/git/connect-github), [GitLab](/docs/cloud/git/connect-gitlab), or [Azure DevOps](/docs/cloud/git/connect-azure-devops), you can use the [Administrative API](/docs/dbt-cloud-apis/admin-cloud-api) to trigger a CI job to run. However, dbt Cloud will not automatically delete the temporary schema for you. This is because automatic deletion relies on incoming webhooks from Git providers, which is only available through the native integrations. + +### Prerequisites + +- You have a dbt Cloud account. +- For the [Concurrent CI checks](/docs/deploy/continuous-integration#concurrent-ci-checks) and [Smart cancellation of stale builds](/docs/deploy/continuous-integration#smart-cancellation) features, your dbt Cloud account must be on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). + + +1. Set up a CI job with the [Create Job](/dbt-cloud/api-v2#/operations/Create%20Job) API endpoint using `"job_type": ci` or from the [dbt Cloud UI](#set-up-ci-jobs). +1. Call the [Trigger Job Run](/dbt-cloud/api-v2#/operations/Trigger%20Job%20Run) API endpoint to trigger the CI job. You must include these fields to the payload: + - Provide the pull request (PR) ID with one of these fields, even if you're using a different Git provider (like Bitbucket). This can make your code less human-readable but it will _not_ affect dbt functionality. + + - `github_pull_request_id` + - `gitlab_merge_request_id` + - `azure_devops_pull_request_id`  + - Provide the `git_sha` or `git_branch` to target the correct commit or branch to run the job against. + +## Example pull requests + +The green checkmark means the dbt build and tests were successful. Clicking on the dbt Cloud section navigates you to the relevant CI run in dbt Cloud. 
+ +### GitHub pull request example + + + +### GitLab pull request example + + + +### Azure DevOps pull request example + + + + +## Troubleshooting + +If you're experiencing any issues, review some of the common questions and answers below. + +
          + Temporary schemas aren't dropping +
          +
          If your temporary schemas aren't dropping after a PR merges or closes, this typically indicates one of these issues: +
            +
          • You have overridden the generate_schema_name macro and it isn't using dbt_cloud_pr_ as the prefix.



            To resolve this, change your macro so that the temporary PR schema name contains the required prefix. For example: +



            + ✅ Temporary PR schema name contains the prefix dbt_cloud_pr_ (like dbt_cloud_pr_123_456_marketing).

            + ❌ Temporary PR schema name doesn't contain the prefix dbt_cloud_pr_ (like marketing).

            +
          • +
            +
          • + A macro is creating a schema but there are no dbt models writing to that schema. dbt Cloud doesn't drop temporary schemas that weren't written to as a result of running a dbt model. +
          • +
          +
          +
          +
          +
          + Reconnecting your dbt project to use dbt Cloud's native integration with GitHub, GitLab, or Azure DevOps +
          +
          If your dbt project relies on the generic git clone method that clones using SSH and deploy keys to connect to your dbt repo, you need to disconnect your repo and reconnect it using the native GitHub, GitLab, or Azure DevOps integration in order to enable dbt Cloud CI.



          + First, make sure you have the native GitHub authentication, native GitLab authentication, or native Azure DevOps authentication set up depending on which git provider you use. After you have gone through those steps, go to Account Settings, select Projects and click on the project you'd like to reconnect through native GitHub, GitLab, or Azure DevOps auth. Then click on the repository link.



          + + Once you're in the repository page, select Edit and then Disconnect Repository at the bottom.

          + +

          + Confirm that you'd like to disconnect your repository. You should then see a new Configure a repository link in your old repository's place. Click through to the configuration page:

          + +

          + + Select the GitHub, GitLab, or AzureDevOps tab and reselect your repository. That should complete the setup of the project and enable you to set up a dbt Cloud CI job.
          +
          +
          +
          + Error messages that refer to schemas from previous PRs +
          +
          If you receive a schema-related error message referencing a previous PR, this is usually an indicator that you are not using a production job for your deferral and are instead using self. If the prior PR has already been merged, the prior PR's schema may have been dropped by the time the CI job for the current PR is kicked off.



          + + To fix this issue, select a production job run to defer to instead of self. +
          +
          +
          +
          + Production job runs failing at the Clone Git Repository step +
          +
          dbt Cloud can only check out commits that belong to the original repository. dbt Cloud cannot check out commits that belong to a fork of that repository.



          + + If you receive the following error message at the Clone Git Repository step of your job run:

          + + Error message:

          + Cloning into '/tmp/jobs/123456/target'...

          + Successfully cloned repository.

          + Checking out to e845be54e6dc72342d5a8f814c8b3316ee220312...

          + Failed to checkout to specified revision.

          + git checkout e845be54e6dc72342d5a8f814c8b3316ee220312

          + fatal: reference is not a tree: e845be54e6dc72342d5a8f814c8b3316ee220312

          +




          + + Double-check that your PR isn't trying to merge using a commit that belongs to a fork of the repository attached to your dbt project.
          +
          +
          +
          + CI job not triggering for Virtual Private dbt users +
          +
          To trigger jobs on dbt Cloud using the API, your Git provider needs to connect to your dbt Cloud account.



          + + If you're on a Virtual Private dbt Enterprise plan using security features like ingress PrivateLink or IP Allowlisting, registering CI hooks may not be available and can cause the job to fail silently.
          +
          +
          diff --git a/website/docs/docs/deploy/cloud-ci-job.md b/website/docs/docs/deploy/cloud-ci-job.md deleted file mode 100644 index 8b4afe0a544..00000000000 --- a/website/docs/docs/deploy/cloud-ci-job.md +++ /dev/null @@ -1,202 +0,0 @@ ---- -title: "dbt Cloud CI job" -id: "cloud-ci-job" -description: "You can enable continuous integration (CI) to test every single change prior to deploying the code to production just like in a software development workflow." ---- - -## Overview - -dbt Cloud makes it easy to test every single code change you make prior to deploying that new logic into production. Once you've connected your [GitHub account](/docs/collaborate/git/connect-github), [GitLab account](/docs/collaborate/git/connect-gitlab), or [Azure DevOps account](/docs/collaborate/git/connect-azure-devops), you can configure jobs to run when new pull requests are opened against your dbt repo. - -dbt Cloud will build the models affected by the new pull request code change in a temp schema, which acts as a quasi-staging environment, and will also run the tests that you've written for these models as a check. When the continuous integration (CI) job completes, the run status will be shown directly in the pull request. This makes it possible to deploy new code to production with confidence. - -:::info Draft Pull Requests - -Jobs will _not_ be triggered by draft pull requests. If you would like jobs to run on each new commit, please mark your pull request as **Ready for review**. - -::: - -:::info GitLab Compatibility - -GitLab Webhooks are available to only GitLab users who have a paid or self-hosted GitLab account. 
- -::: - -:::info Common Errors -If you previously configured your dbt project by providing a generic git URL that clones using SSH, you need to [reconfigure the project](/docs/deploy/cloud-ci-job#reconnecting-your-dbt-project-to-use-dbt-clouds-native-integration-with-github-gitlab-or-azure-devops) to connect through dbt Cloud's native integration with GitHub, GitLab, or Azure DevOps instead. -::: - -## Understanding dbt Cloud Slim CI -When a [dbt Cloud CI job is set up](/docs/deploy/cloud-ci-job#configuring-a-dbt-cloud-ci-job), dbt Cloud will listen for webhooks from GitHub, GitLab, or Azure DevOps indicating that a new PR has been opened or updated with new commits. When one of these webhooks is received, dbt Cloud will enqueue a new run of the CI job. Crucially, this run will build into a temporary schema using the prefix `dbt_cloud_pr_`. This schema isolation acts as a quasi-staging environment, so that you can see the builds resulting from the code associated with the PR's commit sha. The unique schema name can be found in the run details for the given run, as shown below. - - - -After completing the dbt run, dbt Cloud will update the pull request in GitHub, GitLab, or Azure DevOps with a status message indicating the results of the run. The status message will state whether the models and tests ran successfully or not. You can enable a setting in your git provider that makes "successful pull request checks" a requirement to merge code. And finally, once the pull request is closed or merged, dbt Cloud will delete the temporary schema from your . - -### GitHub pull request example - -The green checkmark means the dbt builds and tests were successful. The *Details* link shown here will navigate you to the relevant CI run in dbt Cloud. - - -### GitLab pull request example - -The green checkmark means the dbt builds and tests were successful. Clicking the dbt Cloud pop up will navigate you to the relevant CI run in dbt Cloud. 
- - -### Azure DevOps pull request example - -The green checkmark means the dbt builds and tests were successful. Clicking on the dbt Cloud section navigates you to the relevant CI run in dbt Cloud. - - -## Configuring a dbt Cloud CI job - -Setting up a CI job is very similiar to setting up a normal production job that runs on a schedule; however, a CI job has some noteable differences. - -There are a few components that define a Slim CI job. -- The Slim CI job must defer to a production job. -- The Slim CI job commands need to have a `state:modified+` selector to build only new or changed models and their downstream dependents. Importantly, state comparison can only happen when there is a deferred job selected to compare state to. -- The Slim CI job must be triggered by pull request. - -#### Deferral and State Comparison - -When creating a job in dbt Cloud, you can set your execution settings to defer to a previous run state. Use the dropdown menu to select which *production* job you want to defer to. - - - -When a job is selected, dbt Cloud will look at the artifacts from that job's most recent successful run. dbt will then use those artifacts to determine the set of new and modified resources. - -In your job commands, you can signal to dbt to run only on these modified resources and their children by including the `state:modified+` argument. - -As example: - -``` -dbt build --select state:modified+ -``` - -Because dbt Cloud manages deferral and state environment variables, there is no need to specify `--defer` or `--state` flags. **Note:** Both jobs need to be running dbt v0.18.0 or later. - - -To learn more about state comparison and deferral in dbt, read the docs on [state](understanding-state). - -#### Using a webhook trigger - -In the **Triggers** section of the jobs settings, switch to the **Webhooks** tab, and then check the box next to **Run on Pull Requests?** as shown below. 
- - - -This tells dbt Cloud to run the job whenever a pull request or commit is made, rather than on a schedule. Be sure to turn the schedule of the job off if you don't want it to also run on a time-based cadence. - - - -## Fresh Rebuilds - -As an extension of the Slim CI feature, dbt Cloud can rerun and retest only the things that are fresher compared to a previous run. - - - -Only supported by v1.1 or newer. - - - - - -Only supported by v1.1 or newer. - -:::caution Experimental functionality -The `source_status` selection is experimental and subject to change. During this time, ongoing improvements may limit this feature’s availability and cause breaking changes to its functionality. -::: - -When a job is selected, dbt Cloud will surface the artifacts from that job's most recent successful run. dbt will then use those artifacts to determine the set of fresh sources. In your job commands, you can signal to dbt to run and test only on these fresher sources and their children by including the `source_status:fresher+` argument. This requires both previous and current state to have the `sources.json` artifact be available. Or plainly said, both job states need to run `dbt source freshness`. - -As example: -```bash -# Command step order -dbt source freshness -dbt build --select source_status:fresher+ -``` - - -More example commands in [Pro-tips for workflows](/guides/legacy/best-practices.md#pro-tips-for-workflows). - -## Limitations - -If your temporary PR schemas aren't dropping after a merge or close of the PR, it's likely due to the below scenarios. Open and review the toggles below for recommendations on how to resolve this: - -
          - You used dbt Cloud environment variables in your connection settings page -
          -
          To resolve this, remove environment variables in your connections settings.
          -
          -
          -
          - You have an empty/blank default schema -
          -
          To change this, edit and fill in your default schema.
          -
          -
          -
          - You have overridden the generate_schema_name macro -
          -
          To resolve this, change your macro so that the temporary PR schema name contains the default prefix and review the guidance below: -

          - • ✅ Temporary PR schema name contains the prefix dbt_cloud_pr_ (like dbt_cloud_pr_123_456_marketing)

          - • ❌ Temporary PR schema name doesn't contain the prefix dbt_cloud_pr_ (like marketing).

          -
          -
          -
          -
          - You have overridden the generate_database_name macro -
          -
          If you assume that the project's default connection is to a database named analytics, review the guidance below to resolve this: -

          - • ✅ Database remains the same as the connection default (like analytics)

          - • ❌ Database has changed from the default connection (like dev).

          -
          -
          -
          - - -Make the necessary changes to your project and double-check if the temporary PR schemas drop after a merge or close of the PR. - -## Troubleshooting - -### Reconnecting your dbt project to use dbt Cloud's native integration with GitHub, GitLab, or Azure DevOps - -If your dbt project relies the generic git clone method that clones using SSH and deploy keys to connect to your dbt repo, you need to disconnect your repo and reconnect it using the native GitHub, GitLab, or Azure DevOps integration in order to enable dbt Cloud Slim CI. - -First, make sure you have the [native GitHub authentication](/docs/collaborate/git/connect-github), [native GitLab authentication](/docs/collaborate/git/connect-gitlab), or [native Azure DevOps authentication](/docs/collaborate/git/connect-azure-devops) set up depending on which git provider you use. After you have gone through those steps, head to **Account Settings**, select **Projects** and click on the project you'd like to reconnect through native GitHub, GitLab, or Azure DevOps auth. Then click on the repository link. - -Once you're in the repository page, click **Edit** and then click **Disconnect Repository** at the bottom. - - -Confirm that you'd like to disconnect your repository. You should then see a new **Configure a repository** link in your old repository's place. Click through to the configuration page: - - - -Select the **GitHub**, **GitLab**, or **AzureDevOps** tab and reselect your repository. That should complete the setup of the project and enable you to set up a dbt Cloud CI job. - -### Error messages that refer to schemas from previous PRs - -If you receive a schema-related error message referencing a *previous* PR, this is usually an indicator that you are not using a production job for your deferral and are instead using *self*. If the prior PR has already been merged, the prior PR's schema may have been dropped by the time the Slim CI job for the current PR is kicked off. 
- -To fix this issue, select a production job run to defer to instead of self. - - -### Production job runs failing at the **Clone Git Repository** step - -dbt Cloud can only checkout commits that belong to the original repository. dbt Cloud _cannot_ checkout commits that belong to a fork of that repository. - -If you receive the following error message at the **Clone Git Repository** step of your job run: - -``` -Error message: -Cloning into '/tmp/jobs/123456/target'... -Successfully cloned repository. -Checking out to e845be54e6dc72342d5a8f814c8b3316ee220312... -Failed to checkout to specified revision. -git checkout e845be54e6dc72342d5a8f814c8b3316ee220312 -fatal: reference is not a tree: e845be54e6dc72342d5a8f814c8b3316ee220312 -``` - -Double-check that your PR isn't trying to merge using a commit that belongs to a fork of the repository attached to your dbt project. diff --git a/website/docs/docs/deploy/continuous-integration.md b/website/docs/docs/deploy/continuous-integration.md new file mode 100644 index 00000000000..0f87965aada --- /dev/null +++ b/website/docs/docs/deploy/continuous-integration.md @@ -0,0 +1,52 @@ +--- +title: "Continuous integration in dbt Cloud" +sidebar_label: "Continuous integration" +description: "You can set up continuous integration (CI) checks to test every single change prior to deploying the code to production just like in a software development workflow." +--- + +To implement a continuous integration (CI) workflow in dbt Cloud, you can set up automation that tests code changes by running [CI jobs](/docs/deploy/ci-jobs) before merging to production. dbt Cloud tracks the state of what’s running in your production environment so, when you run a CI job, only the modified data assets in your pull request (PR) and their downstream dependencies are built and tested in a staging schema. 
You can also view the status of the CI checks (tests) directly from within the PR; this information is posted to your Git provider as soon as a CI job completes. Additionally, you can enable settings in your Git provider that allow only PRs with successful CI checks to be approved for merging. + + + +Using CI helps: + +- Provide increased confidence and assurances that project changes will work as expected in production. +- Reduce the time it takes to push code changes to production, through build and test automation, leading to better business outcomes. +- Allow organizations to make code changes in a standardized and governed way that ensures code quality without sacrificing speed. + +## How CI works + +When you [set up CI jobs](/docs/deploy/ci-jobs#set-up-ci-jobs), dbt Cloud listens for webhooks from your Git provider indicating that a new PR has been opened or updated with new commits. When dbt Cloud receives one of these webhooks, it enqueues a new run of the CI job. + +dbt Cloud builds and tests the models affected by the code change in a temporary schema, unique to the PR. This process ensures that the code builds without error and that it matches the expectations as defined by the project's dbt tests. The unique schema name follows the naming convention `dbt_cloud_pr_<job_id>_<pr_id>` (for example, `dbt_cloud_pr_1862_1704`) and can be found in the run details for the given run, as shown in the following image: + + + +When the CI run completes, you can view the run status directly from within the pull request. dbt Cloud updates the pull request in GitHub, GitLab, or Azure DevOps with a status message indicating the results of the run. The status message states whether the models and tests ran successfully or not. + +dbt Cloud deletes the temporary schema from your data warehouse when you close or merge the pull request.
If your project has schema customization using the [generate_schema_name](/docs/build/custom-schemas#how-does-dbt-generate-a-models-schema-name) macro, dbt Cloud might not drop the temporary schema from your data warehouse. For more information, refer to [Troubleshooting](/docs/deploy/ci-jobs#troubleshooting). + +## Differences between CI jobs and other deployment jobs + +The [dbt Cloud scheduler](/docs/deploy/job-scheduler) executes CI jobs differently from other deployment jobs in these important ways: + +- **Concurrent CI checks** — CI runs triggered by the same dbt Cloud CI job execute concurrently (in parallel), when appropriate +- **Smart cancellation of stale builds** — Automatically cancels stale, in-flight CI runs when there are new commits to the PR +- **Run slot treatment** — CI runs don't consume a run slot + +### Concurrent CI checks + +When you have teammates collaborating on the same dbt project creating pull requests on the same dbt repository, the same CI job will get triggered. Since each run builds into a dedicated, temporary schema that’s tied to the pull request, dbt Cloud can safely execute CI runs _concurrently_ instead of _sequentially_ (differing from what is done with deployment dbt Cloud jobs). Because no one needs to wait for one CI run to finish before another one can start, with concurrent CI checks, your whole team can test and integrate dbt code faster. + +Below describes the conditions when CI checks are run concurrently and when they’re not: + +- CI runs with different PR numbers execute concurrently. +- CI runs with the _same_ PR number and _different_ commit SHAs execute serially because they’re building into the same schema. dbt Cloud will run the latest commit and cancel any older, stale commits. For details, refer to [Smart cancellation of stale builds](#smart-cancellation). +- CI runs with the same PR number and same commit SHA, originating from different dbt Cloud projects will execute jobs concurrently. 
This can happen when two CI jobs are set up in different dbt Cloud projects that share the same dbt repository. + +### Smart cancellation of stale builds {#smart-cancellation} + +When you push a new commit to a PR, dbt Cloud enqueues a new CI run for the latest commit and cancels any CI run that is (now) stale and still in flight. This can happen when you’re pushing new commits while a CI build is still in process and not yet done. By cancelling runs in a safe and deliberate way, dbt Cloud helps improve productivity and reduce data platform spend on wasteful CI runs. + + + diff --git a/website/docs/docs/deploy/dashboard-status-tiles.md b/website/docs/docs/deploy/dashboard-status-tiles.md index d6b4bc84a99..67aa1a93c33 100644 --- a/website/docs/docs/deploy/dashboard-status-tiles.md +++ b/website/docs/docs/deploy/dashboard-status-tiles.md @@ -4,8 +4,7 @@ id: "dashboard-status-tiles" description: "Embed Status Tiles in your dashboards to provide consumers with contextual information about the quality and freshness of data." --- -## Overview -In dbt Cloud, the [Metadata API](/docs/dbt-cloud-apis/metadata-api) can power Dashboard Status Tiles. A Dashboard Status Tile is placed on a dashboard (specifically: anywhere you can embed an iFrame) to give insight into the quality and freshness of the data feeding into that dashboard. This is done via dbt [exposures](/docs/build/exposures). +In dbt Cloud, the [Discovery API](/docs/dbt-cloud-apis/discovery-api) can power Dashboard Status Tiles. A Dashboard Status Tile is placed on a dashboard (specifically: anywhere you can embed an iFrame) to give insight into the quality and freshness of the data feeding into that dashboard. This is done via dbt [exposures](/docs/build/exposures). ## Functionality The dashboard status tile looks like this: @@ -23,7 +22,7 @@ First, be sure to enable [source freshness](/docs/deploy/source-freshness) in th In order to set up your dashboard status tile, here is what you need: -1. 
**Metadata Token.** You can learn how to set up a metadata only token [here](/docs/dbt-cloud-apis/service-tokens). +1. **Metadata Only token.** You can learn how to set up a Metadata-Only token [here](/docs/dbt-cloud-apis/service-tokens). 2. **Exposure name.** You can learn more about how to set up exposures [here](/docs/build/exposures). @@ -32,32 +31,72 @@ In order to set up your dashboard status tile, here is what you need: You can insert these three fields into the following iFrame, and then embed it **anywhere that you can embed an iFrame**: ``` - + ``` +:::tip Replace `YOUR_ACCESS_URL` with your region and plan's Access URL + +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. Replace `YOUR_ACCESS_URL` with the appropriate [Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. For example, if your account is hosted in the EMEA region, you would use the following iFrame code: + +``` + +``` + +::: + ## Embedding with BI tools The dashboard status tile should work anywhere you can embed an iFrame. But below are some tactical tips on how to integrate with common BI tools. ### Mode Mode allows you to directly [edit the HTML](https://mode.com/help/articles/report-layout-and-presentation/#html-editor) of any given report, where you can embed the iFrame. -Note that Mode has also built their own [integration](https://mode.com/get-dbt/) with the dbt Cloud Metadata API! +Note that Mode has also built its own [integration](https://mode.com/get-dbt/) with the dbt Cloud Discovery API! ### Looker -Looker does not allow you to directly embed HTML, and instead requires creating a [custom visualization](https://docs.looker.com/admin-options/platform/visualizations). One way to do this for admins is to: +Looker does not allow you to directly embed HTML and instead requires creating a [custom visualization](https://docs.looker.com/admin-options/platform/visualizations). 
One way to do this for admins is to: - Add a [new visualization](https://fishtown.looker.com/admin/visualizations) on the visualization page for Looker admins. You can use [this URL](https://metadata.cloud.getdbt.com/static/looker-viz.js) to configure a Looker visualization powered by the iFrame. It will look like this: - + - Once you have set up your custom visualization, you can use it on any dashboard! You can configure it with the exposure name, jobID, and token relevant to that dashboard. - + ### Tableau Tableau does not require you to embed an iFrame. You only need to use a Web Page object on your Tableau Dashboard and a URL in the following format: +``` +https://metadata.YOUR_ACCESS_URL/exposure-tile?name=&jobId=&token= +``` + +:::tip Replace `YOUR_ACCESS_URL` with your region and plan's Access URL + +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. Replace `YOUR_ACCESS_URL` with the appropriate [Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. For example, if your account is hosted in the North American region, you would use the following code: + ``` https://metadata.cloud.getdbt.com/exposure-tile?name=&jobId=&token= + +``` +::: + + + +### Sigma + +Sigma does not require you to embed an iFrame. Add a new embedded UI element in your Sigma Workbook in the following format: + +``` +https://metadata.YOUR_ACCESS_URL/exposure-tile?name=&jobId=&token= +``` + +:::tip Replace `YOUR_ACCESS_URL` with your region and plan's Access URL + +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. Replace `YOUR_ACCESS_URL` with the appropriate [Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. 
For example, if your account is hosted in the APAC region, you would use the following code: + +``` +https://metadata.au.dbt.com/exposure-tile?name=&jobId=&token= + ``` +::: - + diff --git a/website/docs/docs/deploy/deploy-environments.md b/website/docs/docs/deploy/deploy-environments.md new file mode 100644 index 00000000000..bdcf36b7a30 --- /dev/null +++ b/website/docs/docs/deploy/deploy-environments.md @@ -0,0 +1,197 @@ +--- +title: "Deployment environments" +id: "deploy-environments" +description: "Learn about dbt Cloud's deployment environment to seamlessly schedule jobs or enable CI." +--- + +Deployment environments in dbt Cloud are crucial for deploying dbt jobs in production and using features or integrations that depend on dbt metadata or results. To execute dbt, environments determine the settings used during job runs, including: + +- The version of dbt Core that will be used to run your project +- The warehouse connection information (including the target database/schema settings) +- The version of your code to execute + +A dbt Cloud project can have multiple deployment environments, providing you the flexibility and customization to tailor the execution of dbt jobs. You can use deployment environments to [create and schedule jobs](/docs/deploy/deploy-jobs#create-and-schedule-jobs), [enable continuous integration](/docs/deploy/continuous-integration), or more based on your specific needs or requirements. + +:::tip Learn how to manage dbt Cloud environments +To learn different approaches to managing dbt Cloud environments and recommendations for your organization's unique needs, read [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview). +::: + +This page reviews the different types of environments and how to configure your deployment environment in dbt Cloud. 
+ +import CloudEnvInfo from '/snippets/_cloud-environments-info.md'; + + + +## Create a deployment environment + +To create a new dbt Cloud deployment environment, navigate to **Deploy** -> **Environments** and then click **Create Environment**. Select **Deployment** as the environment type. + + + +### Set as production environment (Beta) + +import ExpBeta from '/snippets/_explorer-beta-banner.md'; + + + + + +In dbt Cloud, each project can have one designated deployment environment, which serves as its production environment. This production environment is _essential_ for using features like dbt Explorer and cross-project references. It acts as the source of truth for the project's production state in dbt Cloud. + +### Semantic Layer + +For Semantic Layer-eligible customers, the next section of environment settings is the Semantic Layer configurations. [The Semantic Layer setup guide](/docs/use-dbt-semantic-layer/setup-sl) has the most up-to-date setup instructions! + +### Deployment connection + +:::info Warehouse Connections + + Warehouse connections are set at the Project level for dbt Cloud accounts, and each Project can have one connection (Snowflake account, Redshift host, BigQuery project, Databricks host, etc.). Some details of that connection (databases/schemas/etc.) can be overridden within this section of the dbt Cloud environment settings. + +::: + +This section determines the exact location in your warehouse dbt should target when building warehouse objects! This section will look a bit different depending on your warehouse provider. + + + + +
          + +This section will not appear if you are using Postgres, as all values are inferred from the project's connection. + +
          + +
          + +This section will not appear if you are using Redshift, as all values are inferred from the project's connection. + +
          + +
          + + + +#### Editable fields + +- **Role**: Snowflake role +- **Database**: Target database +- **Warehouse**: Snowflake warehouse + +
          + +
          + +This section will not appear if you are using Bigquery, as all values are inferred from the project's connection. + +
          + +
          + +This section will not appear if you are using Spark, as all values are inferred from the project's connection. + +
          + +
          + + + +#### Editable fields + +- **Catalog** (optional): [Unity Catalog namespace](/docs/core/connect-data-platform/databricks-setup) + +
          + +
          + + +### Deployment credentials + +This section allows you to determine the credentials that should be used when connecting to your warehouse. The authentication methods may differ depending on the warehouse and dbt Cloud tier you are on. + + + +
          + + + +#### Editable fields + +- **Username**: Postgres username to use (most likely a service account) +- **Password**: Postgres password for the listed user +- **Schema**: Target schema + +
          + +
          + + + +#### Editable fields + +- **Username**: Redshift username to use (most likely a service account) +- **Password**: Redshift password for the listed user +- **Schema**: Target schema + +
          + +
          + + + +#### Editable fields + +- **Auth Method**: This determines the way dbt connects to your warehouse + - One of: [**Username & Password**, **Key Pair**] +- If **Username & Password**: + - **Username**: username to use (most likely a service account) + - **Password**: password for the listed user +- If **Key Pair**: + - **Username**: username to use (most likely a service account) + - **Private Key**: value of the Private SSH Key (optional) + - **Private Key Passphrase**: value of the Private SSH Key Passphrase (optional, only if required) +- **Schema**: Target Schema for this environment + +
          + +
          + + + +#### Editable fields + +- **Dataset**: Target dataset + +
          + +
          + + + +#### Editable fields + +- **Token**: Access token +- **Schema**: Target schema + +
          + +
          + + + +#### Editable fields + +- **Token**: Access token +- **Schema**: Target schema + +
          + +
          + + +## Related docs + +- [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [CI jobs](/docs/deploy/continuous-integration) +- [Delete a job or environment in dbt Cloud](/faqs/Environments/delete-environment-job) + diff --git a/website/docs/docs/deploy/deploy-jobs.md b/website/docs/docs/deploy/deploy-jobs.md new file mode 100644 index 00000000000..e43020bf66e --- /dev/null +++ b/website/docs/docs/deploy/deploy-jobs.md @@ -0,0 +1,101 @@ +--- +title: "Deploy jobs" +description: "Learn how to create and schedule deploy jobs in dbt Cloud for the scheduler to run. When you run with dbt Cloud, you get built-in observability, logging, and alerting." +tags: [scheduler] +--- + +You can use deploy jobs to build production data assets. Deploy jobs make it easy to run dbt commands against a project in your cloud data platform, triggered either by schedule or events. Each job run in dbt Cloud will have an entry in the job's run history and a detailed run overview, which provides you with: + +- Job trigger type +- Commit SHA +- Environment name +- Sources and documentation info, if applicable +- Job run details, including run timing, [model timing data](#model-timing), and [artifacts](/docs/deploy/artifacts) +- Detailed run steps with logs and their run step statuses + +You can create a deploy job and configure it to run on [scheduled days and times](#schedule-days) or enter a [custom cron schedule](#custom-cron-schedule). + + +## Prerequisites + +- You must have a dbt Cloud account and [Developer seat license](/docs/cloud/manage-access/seats-and-users). If you don't, you can [sign up](https://www.getdbt.com/signup/) for a [free account](https://www.getdbt.com/pricing/). +- You must have a dbt project connected to a [data platform](/docs/cloud/connect-data-platform/about-connections). 
+- You must have [access permission](/docs/cloud/manage-access/about-user-access) to view, create, modify, or run jobs. +- You must set up a [deployment environment](/docs/deploy/deploy-environments). + +## Create and schedule jobs {#create-and-schedule-jobs} + +1. On your deployment environment page, click **Create Job** > **Deploy Job** to create a new deploy job. +2. Options in the **Job Description** section: + - **Job Name** — Specify the name for the deploy job. For example, `Daily build`. + - **Environment** — By default, it’s set to the deployment environment you created the deploy job from. +3. Options in the **Execution Settings** section: + - **Commands** — By default, it includes the `dbt build` command. Click **Add command** to add more [commands](/docs/deploy/job-commands) that you want to be invoked when the job runs. + - **Generate docs on run** — Enable this option if you want to [generate project docs](/docs/collaborate/build-and-view-your-docs) when this deploy job runs. + - **Run source freshness** — Enable this option to invoke the `dbt source freshness` command before running the deploy job. Refer to [Source freshness](/docs/deploy/source-freshness) for more details. +4. Options in the **Schedule** section: + - **Run on schedule** — Enable this option to run the deploy job on a set schedule. + - **Timing** — Specify whether to [schedule](#schedule-days) the deploy job using **Frequency** that runs the job at specific times of day, **Specific Intervals** that runs the job every specified number of hours, or **Cron Schedule** that runs the job specified using [cron syntax](#custom-cron-schedule). + - **Days of the Week** — By default, it’s set to every day when **Frequency** or **Specific Intervals** is chosen for **Timing**. + + + +5. 
(optional) Options in the **Advanced Settings** section: + - **Environment Variables** — Define [environment variables](/docs/build/environment-variables) to customize the behavior of your project when the deploy job runs. + - **Target Name** — Define the [target name](/docs/build/custom-target-names) to customize the behavior of your project when the deploy job runs. Environment variables and target names are often used interchangeably. + - **Run Timeout** — Cancel the deploy job if the run time exceeds the timeout value. + - **Compare changes against** — By default, it’s set to **No deferral**. Select either **Environment** or **This Job** to let dbt Cloud know what it should compare the changes against. + + :::info + Older versions of dbt Cloud only allow you to defer to a specific job instead of an environment. Deferral to a job compares state against the project code that was run in the deferred job's last successful run. Deferral to an environment is more efficient because dbt Cloud compares against the project representation (which is stored in the `manifest.json`) of the last successful deploy job run that executed in the deferred environment. By considering _all_ deploy jobs that run in the deferred environment, dbt Cloud gets a more accurate and up-to-date representation of the project state. + ::: + + - **dbt Version** — By default, it’s set to inherit the [dbt version](/docs/dbt-versions/core) from the environment. dbt Labs strongly recommends that you don't change the default setting. This option to change the version at the job level is useful only when you upgrade a project to the next dbt version; otherwise, mismatched versions between the environment and job can lead to confusing behavior. + - **Threads** — By default, it’s set to 4 [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads). Increase the thread count to increase model execution concurrency. 
+ + +### Schedule days + +To set your job's schedule, use the **Schedule Days** option to choose specific days of the week, and select customized hours or intervals. + +Under **Timing**, you can either use customizable hours for jobs that need to run frequently throughout the day or exact intervals for jobs that need to run at specific times: + +- **Every n hours** — Use this option to set how often your job runs, in hours. Enter a number between 1 and 23 to represent the interval between job runs. For example, if you set it to "every 2 hours", the job will run every 2 hours from midnight UTC. This option is useful if you need to run jobs multiple times per day at regular intervals. + +- **At exact intervals** — Use this option to set specific times when your job should run. You can enter a comma-separated list of hours (in UTC) when you want the job to run. For example, if you set it to `0,12,23`, the job will run at midnight, noon, and 11 PM UTC. This option is useful if you want your jobs to run at specific times of day rather than at a regular interval. + +:::info + +dbt Cloud uses [Coordinated Universal Time](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) (UTC) and does not account for translations to your specific timezone or take into consideration daylight saving time. For example: + +- 0 means 12am (midnight) UTC +- 12 means 12pm (noon) UTC +- 23 means 11pm UTC + +::: + +### Custom cron schedule + +To fully customize the scheduling of your job, choose the **Custom cron schedule** option and use the cron syntax. With this syntax, you can specify the minute, hour, day of the month, month, and day of the week, allowing you to set up complex schedules like running a job on the first Monday of each month. + + + + +Use tools such as [crontab.guru](https://crontab.guru/) to generate the correct cron syntax. This tool allows you to input cron snippets and returns their plain English translations. 
+ +Refer to the following example snippets: + + +- `0 * * * *`: Every hour, at minute 0 +- `*/5 * * * *`: Every 5 minutes +- `5 4 * * *`: At exactly 4:05 AM UTC +- `30 */4 * * *`: At minute 30 past every 4th hour (e.g. 4:30AM, 8:30AM, 12:30PM, etc., all UTC) +- `0 0 */2 * *`: At midnight UTC every other day +- `0 0 * * 1`: At midnight UTC every Monday. + +## Related docs + +- [Artifacts](/docs/deploy/artifacts) +- [Continuous integration (CI) jobs](/docs/deploy/ci-jobs) +- [Webhooks](/docs/deploy/webhooks) diff --git a/website/docs/docs/deploy/deployment-overview.md b/website/docs/docs/deploy/deployment-overview.md index 2bfbaf93e8c..29934663544 100644 --- a/website/docs/docs/deploy/deployment-overview.md +++ b/website/docs/docs/deploy/deployment-overview.md @@ -1,74 +1,125 @@ --- -title: "About deployments" +title: "Deploy dbt" id: "deployments" +sidebar: "Use dbt Cloud's capabilities to seamlessly run a dbt job in production." +hide_table_of_contents: true +tags: ["scheduler"] +pagination_next: "docs/deploy/job-scheduler" +pagination_prev: null --- -Running dbt in production means setting up a system to run a _dbt job on a schedule_, rather than running dbt commands manually from the command line. Your production dbt jobs should create the tables and views that your business intelligence tools and end users query. Before continuing, make sure you understand dbt's approach to [managing environments](/docs/collaborate/environments). +Use dbt Cloud's capabilities to seamlessly run a dbt job in production or staging environments. Rather than run dbt commands manually from the command line, you can leverage the [dbt Cloud's in-app scheduling](/docs/deploy/job-scheduler) to automate how and when you execute dbt. -In addition to setting up a schedule, there are other considerations when setting up dbt to run in production: +dbt Cloud offers the easiest and most reliable way to run your dbt project in production. 
Effortlessly promote high quality code from development to production and build fresh data assets that your business intelligence tools and end users query to make business decisions. Deploying with dbt Cloud lets you: +- Keep production data fresh on a timely basis +- Ensure CI and production pipelines are efficient +- Identify the root cause of failures in deployment environments +- Maintain high-quality code and data in production +- Gain visibility into the health of deployment jobs, models, and tests -* The complexity involved in creating a new dbt job or editing an existing one. -* Setting up notifications if a step within your job returns an error code (for example, a model can't be built or a test fails). -* Accessing logs to help debug any issues. -* Pulling the latest version of your git repo before running dbt (continuous deployment). -* Running your dbt project before merging code into master (continuous integration). -* Allowing access for team members that need to collaborate on your dbt project. +Before continuing, make sure you understand dbt's approach to [deployment environments](/docs/deploy/deploy-environments). -## Run dbt in production +Learn how to use dbt Cloud's features to help your team ship timely and quality production data more easily. +## Deploy with dbt + +
          + + + + -If you want to run dbt jobs on a schedule, you can use tools such as dbt Cloud, Airflow, Prefect, Dagster, automation server, or Cron. + -## dbt Cloud + -We've built dbt Cloud to empower data teams to easily run dbt in production. If you're interested in trying out dbt Cloud, you can [sign up for an account](https://cloud.getdbt.com/signup/). +

          -dbt Cloud enables you to: -- run your jobs on a schedule -- view logs for any historical invocation of dbt -- configure error notifications -- render your project's documentation +## Monitor jobs and alerts -In general, the dbt Cloud application deployment models fall into two categories: **Multi Tenant** and **Single Tenant**. These deployments are hosted on infrastructure managed by dbt Labs. Both models leverage AWS infrastructure as described in the [Architecture](/docs/deploy/architecture) section. +
          -For more information on these deployment models, refer to: + -- [Multi Tenant](/docs/deploy/multi-tenant) -- [Single Tenant](/docs/deploy/single-tenant) + -If you’re interested in learning more about an Enterprise plan, please [contact us](mailto:sales@getdbt.com). + -## Airflow + -If your organization is using [Airflow](https://airflow.apache.org/), there are a number of ways you can run your dbt jobs, including: + -* Installing the [dbt Cloud Provider](https://registry.astronomer.io/providers/dbt-cloud) to orchestrate dbt Cloud jobs. This package contains multiple Hooks, Operators, and Sensors to complete various actions within dbt Cloud. + - - + -* Invoking dbt Core jobs through the [BashOperator](https://registry.astronomer.io/providers/apache-airflow/modules/bashoperator). In this case, be sure to install dbt into a virtual environment to avoid issues with conflicting dependencies between Airflow and dbt. +

          -For more details on both of these methods, including example implementations, check out [this guide](https://www.astronomer.io/guides/airflow-dbt). -## Prefect + -## Cron -Cron is a decent way to schedule bash commands. However, while it may seem like an easy route to schedule a job, writing code to take care of all of the additional features associated with a production deployment often makes this route more complex compared to other options listed here. +## Related docs -## Related docs -- [What are the dbt commands you run in your production deployment of dbt?](https://discourse.getdbt.com/t/what-are-the-dbt-commands-you-run-in-your-production-deployment-of-dbt/366) +- [Integrate with other orchestration tools](/docs/deploy/deployment-tools) diff --git a/website/docs/docs/deploy/deployment-tools.md b/website/docs/docs/deploy/deployment-tools.md new file mode 100644 index 00000000000..6fba9caf6e8 --- /dev/null +++ b/website/docs/docs/deploy/deployment-tools.md @@ -0,0 +1,139 @@ +--- +title: "Integrate with other orchestration tools" +id: "deployment-tools" +sidebar_label: "Integrate with other tools" +pagination_next: null +--- + +Alongside [dbt Cloud](/docs/deploy/jobs), discover other ways to schedule and run your dbt jobs with the help of tools such as Airflow, Prefect, Dagster, automation server, Cron, and Azure Data Factory (ADF), + +Build and install these tools to automate your data workflows, trigger dbt jobs (including those hosted on dbt Cloud), and enjoy a hassle-free experience, saving time and increasing efficiency. + +## Airflow + +If your organization is using [Airflow](https://airflow.apache.org/), there are a number of ways you can run your dbt jobs, including: + + + + + +Installing the [dbt Cloud Provider](https://airflow.apache.org/docs/apache-airflow-providers-dbt-cloud/stable/index.html) to orchestrate dbt Cloud jobs. This package contains multiple Hooks, Operators, and Sensors to complete various actions within dbt Cloud. 
+ + + + + + + + +Invoking dbt Core jobs through the [BashOperator](https://registry.astronomer.io/providers/apache-airflow/modules/bashoperator). In this case, be sure to install dbt into a virtual environment to avoid issues with conflicting dependencies between Airflow and dbt. + + + + +For more details on both of these methods, including example implementations, check out [this guide](https://docs.astronomer.io/learn/airflow-dbt-cloud). + +## Azure Data Factory + +Integrate dbt Cloud and [Azure Data Factory](https://learn.microsoft.com/en-us/azure/data-factory/) (ADF) for a smooth data process, from data ingestion to data transformation. You can seamlessly trigger dbt Cloud jobs upon completion of ingestion jobs by using the [dbt API](/docs/dbt-cloud-apis/overview) in ADF. Need help building this out? [Contact us](https://www.getdbt.com/contact/) today! + + +The following video provides you with a detailed overview of how to trigger a dbt Cloud job via the API in Azure Data Factory. + + + + +To use the dbt API to trigger a job in dbt Cloud through ADF: + +1. In dbt Cloud, go to the job settings of the daily production job and turn off the scheduled run in the **Trigger** section. +2. You'll want to create a pipeline in ADF to trigger a dbt Cloud job. +3. Securely fetch the dbt Cloud service token from a key vault in ADF, using a web call as the first step in the pipeline. +4. Set the parameters in the pipeline, including the dbt Cloud account ID and job ID, as well as the name of the key vault and secret that contains the service token. + * You can find the dbt Cloud job and account id in the URL, for example, if your URL is `https://cloud.getdbt.com/deploy/88888/projects/678910/jobs/123456`, the account ID is 88888 and the job ID is 123456 +5. Trigger the pipeline in ADF to start the dbt Cloud job and monitor the status of the dbt Cloud job in ADF. +6. In dbt Cloud, you can check the status of the job and how it was triggered in dbt Cloud. 
+ + +## Prefect + +If your organization is using [Prefect](https://www.prefect.io/), the way you will run your jobs depends on the Prefect version you're on, and whether you're orchestrating dbt Cloud or dbt Core jobs. Refer to the following variety of options: + + + + +### Prefect 2 + + + + + +- Use the [trigger_dbt_cloud_job_run_and_wait_for_completion](https://prefecthq.github.io/prefect-dbt/cloud/jobs/#prefect_dbt.cloud.jobs.trigger_dbt_cloud_job_run_and_wait_for_completion) flow. +- As jobs are executing, you can poll dbt to see whether or not the job completes without failures, through the [Prefect user interface (UI)](https://docs.prefect.io/ui/overview/). + + + + + + + + +- Use the [trigger_dbt_cli_command](https://prefecthq.github.io/prefect-dbt/cli/commands/#prefect_dbt.cli.commands.trigger_dbt_cli_command) task. +- For details on both of these methods, see [prefect-dbt docs](https://prefecthq.github.io/prefect-dbt/). + + + + + +### Prefect 1 + + + + + +- Trigger dbt Cloud jobs with the [DbtCloudRunJob](https://docs.prefect.io/api/latest/tasks/dbt.html#dbtcloudrunjob) task. +- Running this task will generate a markdown artifact viewable in the Prefect UI. +- The artifact will contain links to the dbt artifacts generated as a result of the job run. + + + + + +- Use the [DbtShellTask](https://docs.prefect.io/api/latest/tasks/dbt.html#dbtshelltask) to schedule, execute, and monitor your dbt runs. +- Use the supported [ShellTask](https://docs.prefect.io/api/latest/tasks/shell.html#shelltask) to execute dbt commands through the shell. + + + + + + +## Dagster + +If your organization is using [Dagster](https://dagster.io/), you can use the [dagster_dbt](https://docs.dagster.io/_apidocs/libraries/dagster-dbt) library to integrate dbt commands into your pipelines. This library supports the execution of dbt through dbt Cloud, dbt CLI and the dbt RPC server. Running dbt from Dagster automatically aggregates metadata about your dbt runs. 
Refer to the [example pipeline](https://dagster.io/blog/dagster-dbt) for details. + +## Kestra + +If your organization uses [Kestra](http://kestra.io/), you can leverage the [dbt plugin](https://kestra.io/plugins/plugin-dbt) to orchestrate dbt Cloud and dbt Core jobs. Kestra's user interface (UI) has built-in [Blueprints](https://kestra.io/docs/user-interface-guide/blueprints), providing ready-to-use workflows. Navigate to the Blueprints page in the left navigation menu and [select the dbt tag](https://demo.kestra.io/ui/blueprints/community?selectedTag=36) to find several examples of scheduling dbt CLI commands and dbt Cloud jobs as part of your data pipelines. After each scheduled or ad-hoc workflow execution, the Outputs tab in the Kestra UI allows you to download and preview all dbt build artifacts. The Gantt and Topology view additionally render the metadata to visualize dependencies and runtimes of your dbt models and tests. The dbt Cloud task provides convenient links to easily navigate between Kestra and dbt Cloud UI. + +## Automation servers + +Automation servers, like CodeDeploy, GitLab CI/CD ([video](https://youtu.be/-XBIIY2pFpc?t=1301)), Bamboo and Jenkins, can be used to schedule bash commands for dbt. They also provide a UI to view logging to the command line, and integrate with your git repository. + +## Cron + +Cron is a decent way to schedule bash commands. However, while it may seem like an easy route to schedule a job, writing code to take care of all of the additional features associated with a production deployment often makes this route more complex compared to other options listed here. + +## Databricks workflows + +Use Databricks workflows to call the dbt Cloud job API, which has several benefits such as integration with other ETL processes, utilizing dbt Cloud job features, separation of concerns, and custom job triggering based on custom conditions or logic. 
These advantages lead to more modularity, efficient debugging, and flexibility in scheduling dbt Cloud jobs. + +For more info, refer to the guide on [Databricks workflows and dbt Cloud jobs](/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs). + + + +## Related docs + +- [dbt Cloud plans and pricing](https://www.getdbt.com/pricing/) +- [Quickstart guides](/quickstarts) +- [Webhooks for your jobs](/docs/deploy/webhooks) +- [Orchestration guides](https://docs.getdbt.com/guides/orchestration) +- [Commands for your production deployment](https://discourse.getdbt.com/t/what-are-the-dbt-commands-you-run-in-your-production-deployment-of-dbt/366) diff --git a/website/docs/docs/deploy/job-commands.md b/website/docs/docs/deploy/job-commands.md new file mode 100644 index 00000000000..db284c78a05 --- /dev/null +++ b/website/docs/docs/deploy/job-commands.md @@ -0,0 +1,81 @@ +--- +title: "Job commands" +id: "job-commands" +description: "How to use dbt commands to set tasks for your dbt Cloud jobs." +--- + +A dbt Cloud production job allows you to set up a system to run a dbt job and job commands on a schedule, rather than running dbt commands manually from the command line or [IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). A job consists of commands that are "chained" together and executed as run steps. Each run step can succeed or fail, which may determine the job's run status (Success, Cancel, or Error). + +Each job allows you to: + +- Configure job commands +- View job run details, including timing, artifacts, and detailed run steps +- Access logs to view or help debug issues and historical invocations of dbt +- Set up notifications, and [more](/docs/deploy/deployments#dbt-cloud) + +## Job command types + +Job commands are specific tasks executed by the job, and you can configure them seamlessly by either adding [dbt commands](/reference/dbt-commands) or using the checkbox option in the **Commands** section. 
+ +During a job run, the commands are "chained" together and executed as run steps. When you add a dbt command in the **Commands** section, you can expect different outcomes compared to the checkbox option. + + + + +### Built-in commands + +Every job invocation automatically includes the [`dbt deps`](/reference/commands/deps) command, meaning you don't need to add it to the **Commands** list in your job settings. You will also notice every job will include a run step to reclone your repository and connect to your data platform, which can affect your job status if these run steps aren't successful. + +**Job outcome** — During a job run, the built-in commands are "chained" together. This means if one of the run steps in the chain fails, then the next commands aren't executed, and the entire job fails with an "Error" job status. + + + + +### Checkbox commands + +For every job, you have the option to select the [Generate docs on run](/docs/collaborate/build-and-view-your-docs) or [Run source freshness](/docs/deploy/source-freshness) checkboxes, enabling you to run the commands automatically. + +**Job outcome Generate docs on run checkbox** — dbt Cloud executes the `dbt docs generate` command, _after_ the listed commands. If that particular run step in your job fails, the job can still succeed if all subsequent run steps are successful. Read [Build and view your docs](/docs/collaborate/build-and-view-your-docs) for more info. + +**Job outcome Source freshness checkbox** — dbt Cloud executes the `dbt source freshness` command as the first run step in your job. If that particular run step in your job fails, the job can still succeed if all subsequent run steps are successful. Read [Source freshness](/docs/deploy/source-freshness) for more info. + +### Command list + +You can add or remove as many [dbt commands](/reference/dbt-commands) as necessary for every job. However, you need to have at least one dbt command. 
A few commands are listed as "dbt Core" in the [dbt Command reference doc](/reference/dbt-commands) page. This means they are meant for use in [dbt Core](/docs/core/about-dbt-core) only and are not available in dbt Cloud. + + +:::tip Using selectors + +Use [selectors](/reference/node-selection/syntax) as a powerful way to select and execute portions of your project in a job run. For example, to run tests for one_specific_model, use the selector: `dbt test --select one_specific_model`. The job will still run if a selector doesn't match any models. + +::: + + +**Job outcome** — During a job run, the commands are "chained" together and executed as run steps. If one of the run steps in the chain fails, then the subsequent steps aren't executed, and the job will fail. + +In the following example image, the first four run steps are successful. However, if the fifth run step (`dbt run --select state:modified+ --full-refresh --fail-fast`) fails, then the next run steps aren't executed, and the entire job fails. The failed job returns a non-zero [exit code](/reference/exit-codes) and "Error" job status: + + + +## Job command failures + +Job command failures can mean different things for different commands. Some common reasons why a job command may fail: + +- **Failure at `dbt run`** — [`dbt run`](/reference/commands/run) executes compiled SQL model files against the current target database. It will fail if there is an error in any of the built models. Tests on upstream resources prevent downstream resources from running and a failed test will skip them. + +- **Failure at `dbt test`** — [`dbt test`](/reference/commands/test) runs tests defined on models, sources, snapshots, and seeds. A test can pass, fail, or warn depending on its [severity](/reference/resource-configs/severity). Unless you set [warnings as errors](/reference/global-configs/warnings), only an error stops the next step.
+ +- **Failure at `dbt build`** — [`dbt build`](/reference/commands/build) runs models, tests, snapshots, and seeds. This command executes resources in the DAG-specified order. If any upstream resource fails, all downstream resources are skipped, and the command exits with an error code of 1. + +- **Selector failures** + - If a [`select`](/reference/node-selection/set-operators) matches multiple nodes and one of the nodes fails, then the job will have an exit code `1` and the subsequent command will fail. If you specified the [`--fail-fast`](/reference/global-configs/failing-fast) flag, then the first failure will stop the entire connection for any models that are in progress. + + - If a selector doesn't match any nodes, it's not considered a failure. + + +## Related docs +- [Job creation best practices](https://discourse.getdbt.com/t/job-creation-best-practices-in-dbt-cloud-feat-my-moms-lasagna/2980) +- [dbt Command reference](/reference/dbt-commands) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) +- [Build and view your docs](/docs/collaborate/build-and-view-your-docs) diff --git a/website/docs/docs/deploy/job-notifications.md b/website/docs/docs/deploy/job-notifications.md index f6efb0223ed..8d242abac78 100644 --- a/website/docs/docs/deploy/job-notifications.md +++ b/website/docs/docs/deploy/job-notifications.md @@ -4,23 +4,30 @@ id: "job-notifications" description: "Set up notifications in dbt Cloud to receive Email or Slack alerts for job run status." --- -### Overview Setting up notifications in dbt Cloud will allow you to receive alerts via Email or a chosen Slack channel when a job run succeeds, fails, or is cancelled. ### Email -There are two options for setting up email notifications. As a **user**, you can set up email notifications for yourself under your Profile. As an **admin**, you can set up notifications on behalf of your team members.
+The following options are available for setting up email notifications. Refer to [Users and licenses](/docs/cloud/manage-access/seats-and-users) for info on license types eligible for email notifications. -1. Click the gear in the top right and select **Notification settings**. +- As a **user** — You can set up email notifications for yourself under your Profile. +- As an **admin** — You can set up notifications on behalf of your team members. -2. **As a user:** Select **Edit** and select the type of Notification (Succeeds, Fails, or Is Cancelled) for each Job for which you would like to be notified, or +To set up job notifications, follow these steps: - **As an admin:** Select one or more users you'd like to set notifications for. If you only see your own name, then you might not have admin privileges. Select **Edit** and select the type of Notification (Succeeds, Fails, or Is Cancelled) for each Job for which they will be notified. +1. Click the gear menu in the top right corner and select **Notification Settings**. + +2. Select **Edit** to begin editing the **Email Notifications** settings. + - **As a user:** Choose the Notification type (Succeeds, Fails, or Is Cancelled) for each Job you want to receive notifications for. + + - **As an admin:** Under **Configure notifications for**, use the dropdown to select one or more users you'd like to set notifications for. If you only see your own name, then you might not have admin privileges.

          + Choose the Notification type (Succeeds, Fails, or Is Cancelled) for each Job you want them to receive notifications for. 3. Click **Save**. - + + ### Slack - + diff --git a/website/docs/docs/deploy/job-scheduler.md b/website/docs/docs/deploy/job-scheduler.md new file mode 100644 index 00000000000..fba76f677a7 --- /dev/null +++ b/website/docs/docs/deploy/job-scheduler.md @@ -0,0 +1,92 @@ +--- +title: "Job scheduler" +id: "job-scheduler" +sidebar_label: "Job scheduler" +description: "The dbt Cloud job scheduler queues scheduled or API-triggered runs, before preparing the job to enter cloud data platform. Build observability into transformation workflows with the in-app scheduling, logging, and alerting." +tags: [scheduler] +--- + +The job scheduler is the backbone of running jobs in dbt Cloud, bringing power and simplicity to building data pipelines in both continuous integration and production contexts. The scheduler frees teams from having to build and maintain their own infrastructure, and ensures the timeliness and reliability of data transformations. + +The scheduler enables both cron-based and event-driven execution of dbt commands in the user’s data platform. Specifically, it handles: + +- Cron-based execution of dbt Cloud jobs that run on a predetermined cadence +- Event-driven execution of dbt Cloud CI jobs triggered by pull requests to the dbt repo +- Event-driven execution of dbt Cloud jobs triggered by API +- Event-driven execution of dbt Cloud jobs manually triggered by a user to "Run Now" + +The scheduler handles various tasks including queuing jobs, creating temporary environments to run the dbt commands required for those jobs, providing logs for debugging and remediation, and storing dbt artifacts for direct consumption/ingestion by the Discovery API. 
+ +The scheduler powers running dbt in staging and production environments, bringing ease and confidence to CI/CD workflows and enabling observability and governance in deploying dbt at scale. + +## Scheduler terms + +Familiarize yourself with these useful terms to help you understand how the job scheduler works. + +| Term | Definition | +| --- | --- | +| Scheduler | The dbt Cloud engine that powers job execution. The scheduler queues scheduled or API-triggered job runs, prepares an environment to execute job commands in your cloud data platform, and stores and serves logs and artifacts that are byproducts of run execution. | +| Job | A collection of run steps, settings, and a trigger to invoke dbt commands against a project in the user's cloud data platform. | +| Job queue | The job queue acts as a waiting area for job runs when they are scheduled or triggered to run; runs remain in queue until execution begins. More specifically, the Scheduler checks the queue for runs that are due to execute, ensures the run is eligible to start, and then prepares an environment with appropriate settings, credentials, and commands to begin execution. Once execution begins, the run leaves the queue. | +| Over-scheduled job | A situation when a cron-scheduled job's run duration becomes longer than the frequency of the job’s schedule, resulting in a job queue that will grow faster than the scheduler can process the job’s runs. | +| Prep time | The time dbt Cloud takes to create a short-lived environment to execute the job commands in the user's cloud data platform. Prep time varies most significantly at the top of the hour when the dbt Cloud Scheduler experiences a lot of run traffic. | +| Run | A single, unique execution of a dbt job. | +| Run slot | Run slots control the number of jobs that can run concurrently. 
Developer and Team plan accounts have a fixed number of run slots, and Enterprise users have [unlimited run slots](/docs/dbt-versions/release-notes/July-2023/faster-run#unlimited-job-concurrency-for-enterprise-accounts). Each running job occupies a run slot for the duration of the run. If you need more jobs to execute in parallel, consider the [Enterprise plan](https://www.getdbt.com/pricing/) | +| Threads | When dbt builds a project's DAG, it tries to parallelize the execution by using threads. The [thread](/docs/running-a-dbt-project/using-threads) count is the maximum number of paths through the DAG that dbt can work on simultaneously. The default thread count in a job is 4. | +| Wait time | Amount of time that dbt Cloud waits before running a job, either because there are no available slots or because a previous run of the same job is still in progress. | + + +## Scheduler queue + +The scheduler queues a deployment job to be processed when it's triggered to run because of a [set schedule](#create-and-schedule-jobs), an API call, or manual action. + +Before the job starts executing, the scheduler checks these conditions to determine if the run can start executing: + +- **Is there a run slot that's available on the account for use?** — If all run slots are occupied, the queued run will wait. The wait time is displayed in dbt Cloud. If there are long wait times, [upgrading to Enterprise](https://www.getdbt.com/contact/) can provide more run slots and allow for higher job concurrency. + +- **Does this same job have a run already in progress?** — The scheduler executes distinct runs of the same dbt Cloud job serially to avoid model build collisions. If there's a job already running, the queued job will wait, and the wait time will be displayed in dbt Cloud. + +If there is an available run slot and there isn't an actively running instance of the job, the scheduler will prepare the job to run in your cloud data platform. 
This prep involves readying a Kubernetes pod with the right version of dbt installed, setting environment variables, loading data platform credentials, and Git provider authorization, amongst other environment-setting tasks. The time it takes to prepare the job is displayed as **prep time** in the UI. + +Together, **wait time** plus **prep time** is the total time a run spends in the queue (or **Time in queue**). + + + +### Treatment of CI jobs +When compared to deployment jobs, the scheduler behaves differently when handling [continuous integration (CI) jobs](/docs/deploy/continuous-integration). It queues a CI job to be processed when it's triggered to run by a Git pull request, and the conditions the scheduler checks to determine if the run can start executing are also different: + +- **Will the CI run consume a run slot?** — CI runs don't consume run slots and will never block production runs. +- **Does this same job have a run already in progress?** — CI runs can execute concurrently (in parallel). CI runs build into unique temporary schemas, and CI checks execute in parallel to help increase team productivity. Teammates never have to wait to get a CI check review. + +## Job memory + +In dbt Cloud, the setting to provision memory available to a job is defined at the account-level and applies to each job running in the account; the memory limit cannot be customized per job. If a running job reaches its memory limit, the run is terminated with a "memory limit error" message. + +Jobs consume a lot of memory in the following situations: +- A high thread count was specified +- Custom dbt macros attempt to load data into memory instead of pushing compute down to the cloud data platform +- Having a job that generates dbt project documentation for a large and complex dbt project. 
+ * To prevent problems with the job running out of memory, we recommend generating documentation in a separate job that is set aside for that task and removing `dbt docs generate` from all other jobs. This is especially important for large and complex projects. + +Refer to [dbt Cloud architecture](/docs/cloud/about-cloud/architecture) for an architecture diagram and to learn how the data flows. + +## Run cancellation for over-scheduled jobs + +:::info Scheduler won't cancel API-triggered jobs +The scheduler will not cancel over-scheduled jobs triggered by the [API](/docs/dbt-cloud-apis/overview). +::: + +The dbt Cloud scheduler prevents too many job runs from clogging the queue by canceling unnecessary ones. If a job takes longer to run than its scheduled frequency, the queue will grow faster than the scheduler can process the runs, leading to an ever-expanding queue with runs that don’t need to be processed (called _over-scheduled jobs_). + +The scheduler prevents queue clog by canceling runs that aren't needed, ensuring there is only one run of the job in the queue at any given time. If a newer run is queued, the scheduler cancels any previously queued run for that job and displays an error message. + + + +To prevent over-scheduling, users will need to take action by either refactoring the job so it runs faster or modifying its [schedule](/docs/deploy/deploy-jobs#schedule-days). 
+ +## Related docs +- [dbt Cloud architecture](/docs/cloud/about-cloud/architecture#dbt-cloud-features-architecture) +- [Job commands](/docs/deploy/job-commands) +- [Job notifications](/docs/deploy/job-notifications) +- [Webhooks](/docs/deploy/webhooks) +- [dbt Cloud continuous integration](/docs/deploy/continuous-integration) diff --git a/website/docs/docs/deploy/job-triggers.md b/website/docs/docs/deploy/job-triggers.md deleted file mode 100644 index bf6f6ac06d5..00000000000 --- a/website/docs/docs/deploy/job-triggers.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -title: "Job triggers" -id: "job-triggers" -description: "You can use cron syntax to specify when you want to run a job." ---- - -### Overview - -In dbt Cloud, you can use "cron" syntax to specify when you'd like your job to run. Cron syntax is very expressive, and allows you to completely customize your run schedule. - -If you need help coming up with the right cron syntax, we recommend using a tool like `crontab.guru`. There, you can enter cron snippets and see what they mean in plain English. You can also find some example snippets below. - -### Examples - -- `0 * * * *`: Every hour, at minute 0 -- `*/5 * * * *`: Every 5 minutes -- `5 4 * * *`: At exactly 4:05 AM UTC -- `30 */4 * * *`: At minute 30 past every 4th hour (e.g. 4:30AM, 8:30AM, 12:30PM, etc., all UTC) -- `0 0 */2 * *`: At midnight UTC every other day -- `0 0 * * 1`: At midnight UTC every Monday. - -A custom cron schedule can be specified in the Job Settings page when you edit a job: - -1. Select a job. -2. Click **Settings**. -3. Click **Edit**. -4. In the Triggers section, activate the **Run on schedule** option. -5. Select **Enter custom cron schedule**. -6. Enter the custom cron syntax for the schedule you want. 
- - diff --git a/website/docs/docs/deploy/jobs.md b/website/docs/docs/deploy/jobs.md new file mode 100644 index 00000000000..92390907cd0 --- /dev/null +++ b/website/docs/docs/deploy/jobs.md @@ -0,0 +1,22 @@ +--- +title: "Jobs in dbt Cloud" +sidebar_label: "About Jobs" +description: "Learn about deploy jobs and continuous integration (CI) jobs in dbt Cloud and what their differences are." +tags: [scheduler] +pagination_next: "docs/deploy/deploy-jobs" +--- + +In dbt Cloud, there are two types of jobs: +- [Deploy jobs](/docs/deploy/deploy-jobs) — To create and set up triggers for building production data assets +- [Continuous integration (CI) jobs](/docs/deploy/continuous-integration) — To create and set up triggers for checking code changes + +Below is a comparison table that describes how deploy jobs and CI jobs behave differently: + +| | Deploy Jobs | CI Jobs | +| --- | --- | --- | +| Purpose | Builds production data assets. | Builds and tests new code before merging changes into production. | +| Trigger types | Triggered by a schedule or by API. | Triggered by a commit to a PR or by API. | +| Destination | Builds into a production database and schema. | Builds into a staging database and ephemeral schema, lived for the lifetime of the PR. | +| Execution mode | Runs execute sequentially, so as to not have collisions on the underlying DAG. | Runs execute in parallel to promote team velocity. | +| Efficiency run savings | Detects over-scheduled jobs and cancels unnecessary runs to avoid queue clog. | Cancels existing runs when a newer commit is pushed to avoid redundant work. | +| State comparison | Only sometimes needs to detect state. | Almost always needs to compare state against the production environment to build on modified code and its dependents. 
| \ No newline at end of file diff --git a/website/docs/docs/deploy/monitor-jobs.md b/website/docs/docs/deploy/monitor-jobs.md new file mode 100644 index 00000000000..45156bb341c --- /dev/null +++ b/website/docs/docs/deploy/monitor-jobs.md @@ -0,0 +1,30 @@ +--- +title: "Monitor jobs and alerts" +id: "monitor-jobs" +description: "Monitor your dbt Cloud job and set up alerts to ensure seamless orchestration and optimize your data transformations" +tags: ["scheduler"] +pagination_next: "docs/deploy/run-visibility" +--- + +Monitor your dbt Cloud jobs to help identify improvements and set up alerts to proactively notify the right people or team. + +This portion of our documentation will go over dbt Cloud's various capabilities that help you monitor your jobs and set up alerts to ensure seamless orchestration, including: + +- [Run visibility](/docs/deploy/run-visibility) — View your run history to help identify where improvements can be made to scheduled jobs. +- [Retry jobs](/docs/deploy/retry-jobs) — Rerun your errored jobs from start or the failure point. +- [Job notifications](/docs/deploy/job-notifications) — Receive email or Slack notifications when a job run succeeds, fails, or is canceled. +- [Webhooks](/docs/deploy/webhooks) — Use webhooks to send events about your dbt jobs' statuses to other systems. +- [Leverage artifacts](/docs/deploy/artifacts) — dbt Cloud generates and saves artifacts for your project, which it uses to power features like creating docs for your project and reporting freshness of your sources. +- [Source freshness](/docs/deploy/source-freshness) — Monitor data governance by enabling snapshots to capture the freshness of your data sources.
+- [Dashboard status tiles](/docs/deploy/dashboard-status-tiles) — Set up and add status tiles to view data freshness and quality checks + + + + + + + + + + + diff --git a/website/docs/docs/deploy/multi-tenant.md b/website/docs/docs/deploy/multi-tenant.md deleted file mode 100644 index 94399ff4700..00000000000 --- a/website/docs/docs/deploy/multi-tenant.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Multi tenant -id: multi-tenant ---- - -The Multi Tenant (SaaS) deployment environment refers to the SaaS dbt Cloud application hosted by dbt Labs. This is the most commonly used deployment and is completely managed and maintained by dbt Labs, the makers of dbt. As a SaaS product, a user can quickly [create an account](https://www.getdbt.com/signup/) and get started using the product. The deployment is hosted in AWS and will always contain the latest software updates and bug fixes. - -For more information about the dbt Cloud Production deployment see the below. - -- [Application Data Flows](/docs/deploy/architecture#application-data-flows) -- [Hosted Network Architecture](/docs/deploy/architecture#hosted-network-architecture) diff --git a/website/docs/docs/deploy/regions.md b/website/docs/docs/deploy/regions.md deleted file mode 100644 index 004ed9c70aa..00000000000 --- a/website/docs/docs/deploy/regions.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: "Regions" -id: "regions" -description: "Available regions" ---- - -dbt Cloud is hosted in the multiple regions and will always connect to your data platform from the below IP addresses. Be sure to allow traffic from these IPs in your firewall, and include them in any database grants. - -[dbt Cloud Enterprise](https://www.getdbt.com/pricing/) plans can choose to have their account hosted in any of the below regions. Organizations **must** choose a single region per dbt Cloud account. If you need to run dbt Cloud in multiple regions, we recommend using multiple dbt Cloud accounts. 
- - -| Region | Location | Access URL | IP addresses | Developer plan | Team plan | Enterprise plan | -|--------|----------|------------|--------------|-----------------|------------|------------------| -| North America | us-east-1 (N. Virginia) | cloud.getdbt.com | 52.45.144.63
          54.81.134.249
          52.22.161.231 | ✅ | ✅ | ✅ | -| EMEA | eu-central-1 (Frankfurt) | emea.dbt.com | 3.123.45.39
          3.126.140.248
          3.72.153.148 | ❌ | ❌ | ✅ | -| Virtual Private dbt | Customized | Customized | Ask [Support](/guides/legacy/getting-help#dbt-cloud-support) for your IPs | ❌ | ❌ | ✅ | - - - diff --git a/website/docs/docs/deploy/retry-jobs.md b/website/docs/docs/deploy/retry-jobs.md new file mode 100644 index 00000000000..ea616121f38 --- /dev/null +++ b/website/docs/docs/deploy/retry-jobs.md @@ -0,0 +1,32 @@ +--- +title: "Retry your dbt jobs" +sidebar_label: "Retry jobs" +description: "Rerun your errored jobs from start or the failure point." +--- + +If your dbt job run completed with a status of **Error**, you can rerun it from start or from the point of failure in dbt Cloud. + +## Prerequisites + +- You have a [dbt Cloud account](https://www.getdbt.com/signup). +- You must be using [dbt version](/docs/dbt-versions/upgrade-core-in-cloud) 1.6 or newer. +- The most recent run of the job hasn't completed successfully. The latest status of the run is **Error**. + - The job command that failed in the run must be one that supports the [retry command](/reference/commands/retry). + +## Rerun an errored job + +1. Select **Deploy** from the top navigation bar and choose **Run History.** +2. Choose the job run that has errored. +3. In the **Run Summary** tab on the job’s **Run** page, expand the run step that failed. An :x: denotes the failed step. +4. Examine the error message and determine how to fix it. After you have made your changes, save and commit them to your [Git repo](/docs/collaborate/git-version-control). +5. Return to your job’s **Run** page. In the upper right corner, click **Rerun** and choose **Rerun from start** or **Rerun from failure**. + + If you chose to rerun from the failure point, a **Rerun failed steps** modal opens. The modal lists the run steps that will be invoked: the failed step and any skipped steps. To confirm these run steps, click **Rerun from failure**. The job reruns from the failed command in the previously failed run. 
A banner at the top of the **Run Summary** tab captures this with the message, "This run resumed execution from last failed step". + + + +## Related content +- [Retry a failed run for a job](/dbt-cloud/api-v2#/operations/Retry%20a%20failed%20run%20for%20a%20job) API endpoint +- [Run visibility](/docs/deploy/run-visibility) +- [Jobs](/docs/deploy/jobs) +- [Job commands](/docs/deploy/job-commands) \ No newline at end of file diff --git a/website/docs/docs/deploy/run-visibility.md b/website/docs/docs/deploy/run-visibility.md new file mode 100644 index 00000000000..ff9abfa5b0b --- /dev/null +++ b/website/docs/docs/deploy/run-visibility.md @@ -0,0 +1,35 @@ +--- +title: "Run visibility" +description: "Monitor your jobs to make performance improvements." +tags: ["scheduler"] +--- + +You can view the history of your runs and the model timing dashboard to help identify where improvements can be made to jobs. + + +## Run history + +The **Run History** dashboard in dbt Cloud helps you monitor the health of your dbt project. It provides a detailed overview of all of your project's job runs and empowers you with a variety of filters to help you focus on specific aspects. You can also use it to review recent runs, find errored runs, and track the progress of runs in progress. You can access it on the top navigation menu by clicking **Deploy** and then **Run History**. + +The dashboard displays your full run history, including job name, status, associated environment, job trigger, commit SHA, schema, and timing info. + +dbt Cloud developers can access their run history for the last 365 days through the dbt Cloud user interface (UI) and API. + +We limit self-service retrieval of run history metadata to 365 days to improve dbt Cloud's performance. For more info on the run history retrieval change, refer to [Older run history retrieval change](/docs/dbt-versions/release-notes/May-2023/run-history-endpoint). 
+ + + +## Access logs + +You can view or download in-progress and historical logs for your dbt runs. This makes it easier for the team to debug errors more efficiently. + + + +## Model timing +> Available on [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) dbt Cloud accounts on the [Team or Enterprise plans](https://www.getdbt.com/pricing/). + +The model timing dashboard on dbt Cloud displays the composition, order, and time taken by each model in a job run. The visualization appears for successful jobs and highlights the top 1% of model durations. This helps you identify bottlenecks in your runs, so you can investigate them and potentially make changes to improve their performance. + +You can find the dashboard on the **Run Overview** page. + + diff --git a/website/docs/docs/deploy/single-tenant.md b/website/docs/docs/deploy/single-tenant.md deleted file mode 100644 index 54cd764dadf..00000000000 --- a/website/docs/docs/deploy/single-tenant.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: Single tenant -id: single-tenant ---- - -The Single Tenant deployment environment provides a hosted alternative to the Multi Tenant (SaaS) dbt Cloud environment. While still managed and maintained by dbt Labs, the Single Tenant environment provides dedicated infrastructure with one or more instances of dbt Cloud that can only be accessed by a single customer. This is accomplished by spinning up all the necessary infrastructure with a re-usable Infrastructure as Code (IaC) deployment built with [Terraform](https://www.terraform.io/). The Single Tenant infrastructure lives in a dedicated AWS account and can be customized with certain configurations such as Firewall rules to limit ingress traffic or hosting in a specific AWS Region. 
- -A few common reasons for choosing a Single Tenant deployment over the Production SaaS product include: -- A requirement that the dbt Cloud application be hosted in a dedicated VPC that is logically separated from other customer infrastructure -- A desire for multiple isolated dbt Cloud instances for testing, development, etc - -_To learn more about setting up a dbt Cloud Single Tenant deployment, [please contact our sales team](mailto:sales@getdbt.com)._ - -For more information about the dbt Cloud Single Tenant deployment see the below. - -- [Application Data Flows](/docs/deploy/architecture#application-data-flows) -- [Hosted Network Architecture](/docs/deploy/architecture#hosted-network-architecture) diff --git a/website/docs/docs/deploy/source-freshness.md b/website/docs/docs/deploy/source-freshness.md index 5056e258eda..78500416c56 100644 --- a/website/docs/docs/deploy/source-freshness.md +++ b/website/docs/docs/deploy/source-freshness.md @@ -4,39 +4,31 @@ id: "source-freshness" description: "Validate that data freshness meets expectations and alert if stale." --- -## Data Source Freshness - -dbt Cloud provides a helpful interface around dbt's [source data freshness](/docs/build/sources#snapshotting-source-data-freshness) calculations. When a dbt Cloud job is configured to snapshot source data freshness, dbt Cloud will render a user interface showing you the state of the most recent snapshot. This interface is intended to help you determine if your source data freshness is meeting the SLAs that you've defined for your organization. +dbt Cloud provides a helpful interface around dbt's [source data freshness](/docs/build/sources#snapshotting-source-data-freshness) calculations. When a dbt Cloud job is configured to snapshot source data freshness, dbt Cloud will render a user interface showing you the state of the most recent snapshot. 
This interface is intended to help you determine if your source data freshness is meeting the service level agreement (SLA) that you've defined for your organization. ### Enabling source freshness snapshots -First, make sure to configure your sources to [snapshot freshness information](/docs/build/sources#snapshotting-source-data-freshness). - - - - - **v0.21.0:** Renamed `dbt source snapshot-freshness` to `dbt source freshness`. If using an older version of dbt, the command is `snapshot-freshness`. - To have dbt Cloud display data source freshness as a rendered user interface, you will still need to use the pre-v0.21 syntax of `dbt source snapshot-freshness`. - - +[`dbt build`](/reference/commands/build) does _not_ include source freshness checks when building and testing resources in your DAG. Instead, you can use one of these common patterns for defining jobs: +- Add `dbt build` to the run step to run models, tests, and so on. +- Select the **Generate docs on run** checkbox to automatically [generate project docs](/docs/collaborate/build-and-view-your-docs#set-up-a-documentation-job). +- Select the **Run source freshness** checkbox to enable [source freshness](#checkbox) as the first step of the job. -Then, to enable source freshness snapshots in dbt Cloud, add a `dbt source freshness` step to one of your jobs, or create a new job to snapshot source freshness. **Note:** If you're using an older version of dbt Core (before v0.21), you'll need to use the old name of this command instead: `dbt source snapshot-freshness`. See [`source` command docs](commands/source) for details. + - +To enable source freshness snapshots, first make sure to configure your sources to [snapshot freshness information](/docs/build/sources#snapshotting-source-data-freshness). You can add source freshness to the list of commands in the job run steps or enable the checkbox.
However, you can expect different outcomes when you configure a job by selecting the **Run source freshness** checkbox compared to adding the command to the run steps. -You can add `dbt source freshness` anywhere in your list of run steps, but note that if your source data is out of date, this step will "fail', and subsequent steps will not run. dbt Cloud will trigger email notifications (if configured) based on the end state of this step. +Review the following options and outcomes: -If you *do not* want your models to run if your source data is out of date, then it could be a good idea to run `dbt source freshness` as the first step in your job. Otherwise, we recommend adding `dbt source freshness` as the last step in the job, or creating a separate job just for this task. +| Options | Outcomes | +|--------| ------- | +| **Select checkbox** | The **Run source freshness** checkbox in your **Execution Settings** will run `dbt source freshness` as the first step in your job and won't break subsequent steps if it fails. If you want your job dedicated *exclusively* to running freshness checks, you still need to include at least one placeholder step, such as `dbt compile`. | +| **Add as a run step** | Add the `dbt source freshness` command to a job anywhere in your list of run steps. However, if your source data is out of date, this step will "fail", and subsequent steps will not run. dbt Cloud will trigger email notifications (if configured) based on the end state of this step.

          You can create a new job to snapshot source freshness.

          If you *do not* want your models to run if your source data is out of date, then it could be a good idea to run `dbt source freshness` as the first step in your job. Otherwise, we recommend adding `dbt source freshness` as the last step in the job, or creating a separate job just for this task. | -Another option is to select the source freshness checkbox in your execution settings when you configure a job on dbt cloud. Selecting this checkbox will run `dbt source freshness` as the first step in your job, but it will not break subsequent steps if it fails. If you wanted your job dedicated *exclusively* to running freshness checks, you still need to include at least one placeholder step, such as `dbt compile`. -Remember that `dbt build` does _not_ include source freshness checks when it builds and tests resources in your DAG. As such, here's a common pattern for defining jobs: -- `dbt build` as the run step -- check box for generating docs -- check box for source freshness + - ### Source freshness snapshot frequency It's important that your freshness jobs run frequently enough to snapshot data latency in accordance with your SLAs. You can imagine that if you have a 1 hour SLA on a particular dataset, snapshotting the freshness of that once daily would not be appropriate. As a good rule of thumb, you should run your source freshness jobs with at least double the frequency of your lowest SLA. Here's an example table of some reasonable snapshot frequencies given typical SLAs: @@ -49,4 +41,4 @@ It's important that your freshness jobs run frequently enough to snapshot data l ## Further reading -For more on exposing links to the latest documentation and sharing source freshness reports to your team, see [Building and configuring artifacts](artifacts). +For more on exposing links to the latest documentation and sharing source freshness reports to your team, see [Building and configuring artifacts](/docs/deploy/artifacts). 
diff --git a/website/docs/docs/deploy/webhooks.md b/website/docs/docs/deploy/webhooks.md new file mode 100644 index 00000000000..069e7a3e283 --- /dev/null +++ b/website/docs/docs/deploy/webhooks.md @@ -0,0 +1,553 @@ +--- +title: "Webhooks for your jobs" +sidebar_label: "Webhooks" +description: "Get real-time notifications about your dbt jobs with webhooks." +--- + +With dbt Cloud, you can create outbound webhooks to send events (notifications) about your dbt jobs to your other systems. Your other systems can listen for (subscribe to) these events to further automate your workflows or to help trigger automation flows you have set up. + +A webhook is an HTTP-based callback function that allows event-driven communication between two different web applications. This allows you to get the latest information on your dbt jobs in real time. Without it, you would need to make API calls repeatedly to check if there are any updates that you need to account for (polling). Because of this, webhooks are also called _push APIs_ or _reverse APIs_ and are often used for infrastructure development. + +dbt Cloud sends a JSON payload to your application's endpoint URL when your webhook is triggered. You can send a [Slack](/guides/orchestration/webhooks/zapier-slack) notification, a [Microsoft Teams](/guides/orchestration/webhooks/zapier-ms-teams) notification, [open a PagerDuty incident](/guides/orchestration/webhooks/serverless-pagerduty) when a dbt job fails, [and more](/guides/orchestration/webhooks). + +You can create webhooks for these events from the [dbt Cloud web-based UI](#create-a-webhook-subscription) and by using the [dbt Cloud API](#api-for-webhooks): + +- `job.run.started` — Run started. +- `job.run.completed` — Run completed. This can be a run that has failed or succeeded. +- `job.run.errored` — Run errored. + +dbt Cloud retries sending each event five times. dbt Cloud keeps a log of each webhook delivery for 30 days. 
Every webhook has its own **Recent Deliveries** section, which shows at a glance whether each delivery succeeded or failed. + +A webhook in dbt Cloud has a timeout of 10 seconds. This means that if the endpoint doesn't respond within 10 seconds, the webhook processor will time out. This can result in a situation where the client responds successfully after the 10-second timeout and records a success status while the dbt Cloud webhooks system interprets this as a failure. + +:::tip Videos +If you're interested in course learning with videos, check out the [Webhooks on-demand course](https://courses.getdbt.com/courses/webhooks) from dbt Labs. + +You can also check out the free [dbt Fundamentals course](https://courses.getdbt.com/courses/fundamentals). +::: + +## Prerequisites +- You have a dbt Cloud account that is on the [Team or Enterprise plan](https://www.getdbt.com/pricing/). +- For `write` access to webhooks: + - **Enterprise plan accounts** — Permission sets are the same for both API service tokens and the dbt Cloud UI. You, or the API service token, must have the [Account Admin](/docs/cloud/manage-access/enterprise-permissions#account-admin), [Admin](/docs/cloud/manage-access/enterprise-permissions#admin), or [Developer](/docs/cloud/manage-access/enterprise-permissions#developer) permission set. + - **Team plan accounts** — For the dbt Cloud UI, you need to have a [Developer license](/docs/cloud/manage-access/self-service-permissions). For API service tokens, you must assign the service token to have the [Account Admin or Member](/docs/dbt-cloud-apis/service-tokens#team-plans-using-service-account-tokens) permission set. +- You have a multi-tenant deployment model in dbt Cloud. For more information, refer to [Tenancy](/docs/cloud/about-cloud/tenancy). 
+ +## Create a webhook subscription {#create-a-webhook-subscription} + +From your **Account Settings** in dbt Cloud (using the gear menu in the top right corner), click **Create New Webhook** in the **Webhooks** section. You can find the appropriate dbt Cloud access URL for your region and plan with [Regions & IP addresses](/docs/cloud/about-cloud/regions-ip-addresses). + +To configure your new webhook: + +- **Name** — Enter a name for your outbound webhook. +- **Description** — Enter a description of the webhook. +- **Events** — Choose the event you want to trigger this webhook. You can subscribe to more than one event. +- **Jobs** — Specify the job(s) you want the webhook to trigger on. Or, you can leave this field empty for the webhook to trigger on all jobs in your account. By default, dbt Cloud configures your webhook at the account level. +- **Endpoint** — Enter your application's endpoint URL, where dbt Cloud can send the event(s) to. + +When done, click **Save**. dbt Cloud provides a secret token that you can use to [check for the authenticity of a webhook](#validate-a-webhook). It’s strongly recommended that you perform this check on your server to protect yourself from fake (spoofed) requests. + +### Differences between completed and errored webhook events {#completed-errored-event-difference} +The `job.run.errored` event is a subset of the `job.run.completed` events. If you subscribe to both, you will receive two notifications when your job encounters an error. However, dbt Cloud triggers the two events at different times: + +- `job.run.completed` — This event only fires once the job’s metadata and artifacts have been ingested and are available from the dbt Cloud Admin and Discovery APIs. +- `job.run.errored` — This event fires immediately so the job’s metadata and artifacts might not have been ingested. This means that information might not be available for you to use. 
+ +If your integration depends on data from the Admin API (such as accessing the logs from the run) or Discovery API (accessing model-by-model statuses), use the `job.run.completed` event and filter on `runStatus` or `runStatusCode`. + +If your integration doesn’t depend on additional data or if improved delivery performance is more important for you, use `job.run.errored` and build your integration to handle API calls that might not return data for a short period at first. + + +## Validate a webhook + +You can use the secret token provided by dbt Cloud to validate that webhooks received by your endpoint were actually sent by dbt Cloud. Official webhooks will include the `Authorization` header, which contains an HMAC SHA-256 hash of the request body computed with the secret token as the key. + +An example for verifying the authenticity of the webhook in Python: + +```python +auth_header = request.headers.get('authorization', None) +app_secret = os.environ['MY_DBT_CLOUD_AUTH_TOKEN'].encode('utf-8') +signature = hmac.new(app_secret, request_body, hashlib.sha256).hexdigest() +return signature == auth_header + +``` + +## Inspect HTTP requests +When working with webhooks, it’s good practice to use tools like [RequestBin](https://requestbin.com/) and [Requestly](https://requestly.io/). These tools allow you to inspect your HTTP requests, response payloads, and response headers so you can debug and test webhooks before incorporating them into your systems. 
+ +## Examples of JSON payloads + +An example of a webhook payload for a run that's started: + +```json +{ + "accountId": 1, + "webhooksID": "wsu_12345abcde", + "eventId": "wev_2L6Z3l8uPedXKPq9D2nWbPIip7Z", + "timestamp": "2023-01-31T19:28:15.742843678Z", + "eventType": "job.run.started", + "webhookName": "test", + "data": { + "jobId": "123", + "jobName": "Daily Job (dbt build)", + "runId": "12345", + "environmentId": "1234", + "environmentName": "Production", + "dbtVersion": "1.0.0", + "projectName": "Snowflake Github Demo", + "projectId": "167194", + "runStatus": "Running", + "runStatusCode": 3, + "runStatusMessage": "None", + "runReason": "Kicked off from UI by test@test.com", + "runStartedAt": "2023-01-31T19:28:07Z" + } +} +``` + +An example of a webhook payload for a completed run: + +```json +{ + "accountId": 1, + "webhooksID": "wsu_12345abcde", + "eventId": "wev_2L6ZDoilyiWzKkSA59Gmc2d7FDD", + "timestamp": "2023-01-31T19:29:35.789265936Z", + "eventType": "job.run.completed", + "webhookName": "test", + "data": { + "jobId": "123", + "jobName": "Daily Job (dbt build)", + "runId": "12345", + "environmentId": "1234", + "environmentName": "Production", + "dbtVersion": "1.0.0", + "projectName": "Snowflake Github Demo", + "projectId": "167194", + "runStatus": "Success", + "runStatusCode": 10, + "runStatusMessage": "None", + "runReason": "Kicked off from UI by test@test.com", + "runStartedAt": "2023-01-31T19:28:07Z", + "runFinishedAt": "2023-01-31T19:29:32Z" + } +} +``` + +An example of a webhook payload for an errored run: + +```json +{ + "accountId": 1, + "webhooksID": "wsu_12345abcde", + "eventId": "wev_2L6m5BggBw9uPNuSmtg4MUiW4Re", + "timestamp": "2023-01-31T21:15:20.419714619Z", + "eventType": "job.run.errored", + "webhookName": "test", + "data": { + "jobId": "123", + "jobName": "dbt Vault", + "runId": "12345", + "environmentId": "1234", + "environmentName": "dbt Vault Demo", + "dbtVersion": "1.0.0", + "projectName": "Snowflake Github Demo", + "projectId": 
"167194", + "runStatus": "Errored", + "runStatusCode": 20, + "runStatusMessage": "None", + "runReason": "Kicked off from UI by test@test.com", + "runStartedAt": "2023-01-31T21:14:41Z", + "runErroredAt": "2023-01-31T21:15:20Z" + } +} +``` + +## API for webhooks {#api-for-webhooks} +You can use the dbt Cloud API to create new webhooks that you want to subscribe to, get detailed information about your webhooks, and to manage the webhooks that are associated with your account. The following sections describe the API endpoints you can use for this. + +:::info Access URLs +dbt Cloud is hosted in multiple regions in the world and each region has a different access URL. People on Enterprise plans can choose to have their account hosted in any one of these regions. For a complete list of available dbt Cloud access URLs, refer to [Regions & IP addresses](/docs/cloud/about-cloud/regions-ip-addresses). +::: + +### List all webhook subscriptions +List all webhooks that are available from a specific dbt Cloud account. + +#### Request +```shell +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscriptions +``` + +#### Path parameters +| Name | Description | +|------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | +| `account_id` | The dbt Cloud account the webhooks are associated with. 
| + +#### Response sample +```json +{ + "data": [ + { + "id": "wsu_12345abcde", + "account_identifier": "act_12345abcde", + "name": "Webhook for jobs", + "description": "A webhook for when jobs are started", + "job_ids": [ + "123", + "321" + ], + "event_types": [ + "job.run.started" + ], + "client_url": "https://test.com", + "active": true, + "created_at": "1675735768491774", + "updated_at": "1675787482826757", + "account_id": "123", + "http_status_code": "0" + }, + { + "id": "wsu_12345abcde", + "account_identifier": "act_12345abcde", + "name": "Notification Webhook", + "description": "Webhook used to trigger notifications in Slack", + "job_ids": [], + "event_types": [ + "job.run.completed", + "job.run.started", + "job.run.errored" + ], + "client_url": "https://test.com", + "active": true, + "created_at": "1674645300282836", + "updated_at": "1675786085557224", + "http_status_code": "410", + "dispatched_at": "1675786085548538", + "account_id": "123" + } + ], + "status": { + "code": 200 + }, + "extra": { + "pagination": { + "total_count": 2, + "count": 2 + }, + "filters": { + "offset": 0, + "limit": 10 + } + } +} +``` + +#### Response schema +| Name | Description | Possible Values | +| --- | --- | --- | +| `data` | List of available webhooks for the specified dbt Cloud account ID. | | +| `id` | The webhook ID. This is a universally unique identifier (UUID) that's unique across all regions, including multi-tenant and single-tenant deployments. | | +| `account_identifier` | The unique identifier for _your_ dbt Cloud account. | | +| `name` | Name of the outbound webhook. | | +| `description` | Description of the webhook. | | +| `job_ids` | The specific jobs the webhook is set to trigger for. When the list is empty, the webhook is set to trigger for all jobs in your account; by default, dbt Cloud configures webhooks at the account level. | One of these:
          • Empty list
          • List of job IDs
          | +| `event_types` | The event type(s) the webhook is set to trigger on. | One or more of these:
          • `job.run.started`
          • `job.run.completed`
          • `job.run.errored`
          | +| `client_url` | The endpoint URL for an application where dbt Cloud can send event(s) to. | | +| `active` | A Boolean value indicating whether the webhook is active or not. | One of these:
          • `true`
          • `false`
          | +| `created_at` | Timestamp of when the webhook was created. | | +| `updated_at` | Timestamp of when the webhook was last updated. | | +| `http_status_code` | The latest HTTP status of the webhook. | Can be any [HTTP response status code](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status). If the value is `0`, that means the webhook has never been triggered. | +| `dispatched_at` | Timestamp of when the webhook was last dispatched to the specified endpoint URL. | | +| `account_id` | The dbt Cloud account ID. | | + +### Get details about a webhook +Get detailed information about a specific webhook. + +#### Request +```shell +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +``` +#### Path parameters +| Name | Description | +|------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | +| `account_id` | The dbt Cloud account the webhook is associated with. | +| `webhook_id` | The webhook you want detailed information on. | + +#### Response sample +```json +{ + "data": { + "id": "wsu_12345abcde", + "account_identifier": "act_12345abcde", + "name": "Webhook for jobs", + "description": "A webhook for when jobs are started", + "event_types": [ + "job.run.started" + ], + "client_url": "https://test.com", + "active": true, + "created_at": "1675789619690830", + "updated_at": "1675793192536729", + "dispatched_at": "1675793192533160", + "account_id": "123", + "job_ids": [], + "http_status_code": "0" + }, + "status": { + "code": 200 + } +} +``` + +#### Response schema +| Name | Description | Possible Values | +| --- | --- | --- | +| `id` | The webhook ID. | | +| `account_identifier` | The unique identifier for _your_ dbt Cloud account. | | +| `name` | Name of the outbound webhook. | | +| `description` | Complete description of the webhook. | | +| `event_types` | The event type the webhook is set to trigger on. | One or more of these:
          • `job.run.started`
          • `job.run.completed`
          • `job.run.errored`
          | +| `client_url` | The endpoint URL for an application where dbt Cloud can send event(s) to. | | +| `active` | A Boolean value indicating whether the webhook is active or not. | One of these:
          • `true`
          • `false`
          | +| `created_at` | Timestamp of when the webhook was created. | | +| `updated_at` | Timestamp of when the webhook was last updated. | | +| `dispatched_at` | Timestamp of when the webhook was last dispatched to the specified endpoint URL. | | +| `account_id` | The dbt Cloud account ID. | | +| `job_ids` | The specific jobs the webhook is set to trigger for. When the list is empty, the webhook is set to trigger for all jobs in your account; by default, dbt Cloud configures webhooks at the account level. | One of these:
          • Empty list
          • List of job IDs
          | +| `http_status_code` | The latest HTTP status of the webhook. | Can be any [HTTP response status code](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status). If the value is `0`, that means the webhook has never been triggered. | + +### Create a new webhook subscription +Create a new outbound webhook and specify the endpoint URL that will be subscribing (listening) to the webhook's events. + +#### Request sample + +```shell +POST https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscriptions +``` + +```json +{ + "event_types": [ + "job.run.started" + ], + "name": "Webhook for jobs", + "client_url": "https://test.com", + "active": true, + "description": "A webhook for when jobs are started", + "job_ids": [ + 123, + 321 + ] +} +``` + +#### Path parameters +| Name | Description | +| --- | --- | +| `your access URL` | The login URL for your dbt Cloud account. | +| `account_id` | The dbt Cloud account the webhook is associated with. | + +#### Request parameters +| Name | Description | Possible Values | +| --- | --- | --- | +| `event_types` | Enter the event you want to trigger this webhook. You can subscribe to more than one event. | One or more of these:
          • `job.run.started`
          • `job.run.completed`
          • `job.run.errored`
          | +| `name` | Enter the name of your webhook. | | +| `client_url` | Enter your application's endpoint URL, where dbt Cloud can send the event(s) to.| | +| `active` | Enter a Boolean value to indicate whether your webhook is active or not. | One of these:
          • `true`
          • `false`
          | +| `description` | Enter a description of your webhook. | | +| `job_ids` | Enter the specific jobs you want the webhook to trigger on or you can leave this parameter as an empty list. If this is an empty list, the webhook is set to trigger for all jobs in your account; by default, dbt Cloud configures webhooks at the account level. | One of these:
          • Empty list
          • List of job IDs
          | + +#### Response sample +```json +{ + "data": { + "id": "wsu_12345abcde", + "account_identifier": "act_12345abcde", + "name": "Webhook for jobs", + "description": "A webhook for when jobs are started", + "job_ids": [ + "123", + "321" + ], + "event_types": [ + "job.run.started" + ], + "client_url": "https://test.com", + "hmac_secret": "12345abcde", + "active": true, + "created_at": "1675795644808877", + "updated_at": "1675795644808877", + "account_id": "123", + "http_status_code": "0" + }, + "status": { + "code": 201 + } +} +``` + +#### Response schema +| Name | Description | Possible Values | +| --- | --- | --- | +| `id` | The webhook ID. | | +| `account_identifier` | The unique identifier for _your_ dbt Cloud account. | | +| `name` | Name of the outbound webhook. | | +| `description` | Complete description of the webhook. | | +| `job_ids` | The specific jobs the webhook is set to trigger for. When the list is empty, the webhook is set to trigger for all jobs in your account; by default, dbt Cloud configures webhooks at the account level. | One of these:
          • Empty list
          • List of job IDs
          | +| `event_types` | The event type the webhook is set to trigger on. | One or more of these:
          • `job.run.started`
          • `job.run.completed`
          • `job.run.errored`
          | +| `client_url` | The endpoint URL for an application where dbt Cloud can send event(s) to. | | +| `hmac_secret` | The secret key for your new webhook. You can use this key to [validate the authenticity of this webhook](#validate-a-webhook). | | +| `active` | A Boolean value indicating whether the webhook is active or not. | One of these:
          • `true`
          • `false`
          | +| `created_at` | Timestamp of when the webhook was created. | | +| `updated_at` | Timestamp of when the webhook was last updated. | | +| `account_id` | The dbt Cloud account ID. | | +| `http_status_code` | The latest HTTP status of the webhook. | Can be any [HTTP response status code](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status). If the value is `0`, that means the webhook has never been triggered. | + +### Update a webhook +Update the configuration details for a specific webhook. + +#### Request sample +```shell +PUT https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +``` + +```json +{ + "event_types": [ + "job.run.started" + ], + "name": "Webhook for jobs", + "client_url": "https://test.com", + "active": true, + "description": "A webhook for when jobs are started", + "job_ids": [ + 123, + 321 + ] +} +``` + +#### Path parameters +| Name | Description | +|------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | +| `account_id` | The dbt Cloud account the webhook is associated with. | +| `webhook_id` | The webhook you want to update. | + +#### Request parameters +| Name | Description | Possible Values | +|------|-------------|-----------------| +| `event_types` | Update the event type the webhook is set to trigger on. You can subscribe to more than one. | One or more of these:
          • `job.run.started`
          • `job.run.completed`
          • `job.run.errored`
          | +| `name` | Change the name of your webhook. | | +| `client_url` | Update the endpoint URL for an application where dbt Cloud can send event(s) to. | | +| `active` | Change the Boolean value indicating whether the webhook is active or not. | One of these:
          • `true`
          • `false`
          | +| `description` | Update the webhook's description. | | +| `job_ids` | Change which jobs you want the webhook to trigger for. Or, you can use an empty list to trigger it for all jobs in your account. | One of these:
          • Empty list
          • List of job IDs
          | + +#### Response sample +```json +{ + "data": { + "id": "wsu_12345abcde", + "account_identifier": "act_12345abcde", + "name": "Webhook for jobs", + "description": "A webhook for when jobs are started", + "job_ids": [ + "123" + ], + "event_types": [ + "job.run.started" + ], + "client_url": "https://test.com", + "active": true, + "created_at": "1675798888416144", + "updated_at": "1675804719037018", + "http_status_code": "200", + "account_id": "123" + }, + "status": { + "code": 200 + } +} +``` + +#### Response schema +| Name | Description | Possible Values | +| --- | --- | --- | +| `id` | The webhook ID. | | +| `account_identifier` | The unique identifier for _your_ dbt Cloud account. | | +| `name` | Name of the outbound webhook. | | +| `description` | Complete description of the webhook. | | +| `job_ids` | The specific jobs the webhook is set to trigger for. When the list is empty, the webhook is set to trigger for all jobs in your account; by default, dbt Cloud configures webhooks at the account level. | One of these:
          • Empty list
          • List of job IDs
          | +| `event_types` | The event type the webhook is set to trigger on. | One or more of these:
          • `job.run.started`
          • `job.run.completed`
          • `job.run.errored`
          | +| `client_url` | The endpoint URL for an application where dbt Cloud can send event(s) to. | | +| `active` | A Boolean value indicating whether the webhook is active or not. | One of these:
          • `true`
          • `false`
          | +| `created_at` | Timestamp of when the webhook was created. | | +| `updated_at` | Timestamp of when the webhook was last updated. | | +| `http_status_code` | The latest HTTP status of the webhook. | Can be any [HTTP response status code](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status). If the value is `0`, that means the webhook has never been triggered. | +| `account_id` | The dbt Cloud account ID. | | + + +### Test a webhook +Test a specific webhook. + +#### Request +```shell +GET https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id}/test +``` + +#### Path parameters +| Name | Description | +|------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | +| `account_id` | The dbt Cloud account the webhook is associated with. | +| `webhook_id` | The webhook you want to test. | + +#### Response sample +```json +{ + "data": { + "verification_error": null, + "verification_status_code": "200" + }, + "status": { + "code": 200 + } +} +``` + +### Delete a webhook +Delete a specific webhook. + +#### Request +```shell +DELETE https://{your access URL}/api/v3/accounts/{account_id}/webhooks/subscription/{webhook_id} +``` + +#### Path parameters +| Name | Description | +|------------|--------------------------------------| +| `your access URL` | The login URL for your dbt Cloud account. | +| `account_id` | The dbt Cloud account the webhook is associated with. | +| `webhook_id` | The webhook you want to delete. 
| + +#### Response sample + +```json +{ + "data": { + "id": "wsu_12345abcde" + }, + "status": { + "code": 200, + "is_success": true + } +} +``` + +## Related docs +- [dbt Cloud CI](/docs/deploy/continuous-integration) +- [Use dbt Cloud's webhooks with other SaaS apps](/guides/orchestration/webhooks) + diff --git a/website/docs/docs/environments-in-dbt.md b/website/docs/docs/environments-in-dbt.md new file mode 100644 index 00000000000..70bc096cf4f --- /dev/null +++ b/website/docs/docs/environments-in-dbt.md @@ -0,0 +1,40 @@ +--- +title: "About environments" +id: "environments-in-dbt" +hide_table_of_contents: true +pagination_next: null +--- + +In software engineering, environments are used to enable engineers to develop and test code without impacting the users of their software. Typically, there are two types of environments in dbt: + +- **Deployment or Production** (or _prod_) — Refers to the environment that end users interact with. + +- **Development** (or _dev_) — Refers to the environment that engineers work in. This means that engineers can work iteratively when writing and testing new code in _development_. Once they are confident in these changes, they can deploy their code to _production_. + +In traditional software engineering, different environments often use completely separate architecture. For example, the dev and prod versions of a website may use different servers and databases. Data warehouses can also be designed to have separate environments — the _production_ environment refers to the relations (for example, schemas, tables, and views) that your end users query (often through a BI tool). + +Configure environments to tell dbt Cloud or dbt Core how to build and execute your project in development and production: + +
          + + + + + +

          + +## Related docs + +- [dbt Cloud environment best practices](https://docs.getdbt.com/guides/best-practices/environment-setup/1-env-guide-overview) +- [Deployment environments](/docs/deploy/deploy-environments) +- [About dbt Core versions](/docs/dbt-versions/core) +- [Set Environment variables in dbt Cloud](/docs/build/environment-variables#special-environment-variables) +- [Use Environment variables in jinja](/reference/dbt-jinja-functions/env_var) diff --git a/website/docs/docs/get-started/about-the-cli.md b/website/docs/docs/get-started/about-the-cli.md deleted file mode 100644 index cc3639d933f..00000000000 --- a/website/docs/docs/get-started/about-the-cli.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -title: "About the CLI" -id: "about-the-cli" ---- - -dbt ships with a Command Line Interface (CLI) for running your dbt project. This way of running dbt a dbt project is free and open source. - -To use the CLI, your workflow generally looks like: -* **Build your dbt project in a code editor:** popular choices include VSCode and Atom. -* **Run your project from the command line:** - * macOS ships with a default Terminal program, however you can also use iTerm or the command line prompt within a code editor to execute dbt commands - -The CLI is also available for dbt Cloud. Additional components must be installed for the CLI to communicate via dbt Cloud APIs. For more information, visit the [dbt Cloud CLI GitHub repository](https://github.com/data-mie/dbt-cloud-cli) - -:::info How we set up our computers for working on dbt projects - -We've written a [guide](https://discourse.getdbt.com/t/how-we-set-up-our-computers-for-working-on-dbt-projects/243) for our recommended setup when running dbt projects using the CLI. - -::: - -If you're using the CLI, we recommend learning some basics of your terminal to help you work more effectively. 
In particular, it's important to understand `cd`, `ls` and `pwd` to be able to navigate through the directory structure of your computer easily. - -You can find more information on installing and setting up the dbt CLI [here](dbt-cli/cli-overview). diff --git a/website/docs/docs/get-started/connect-your-database.md b/website/docs/docs/get-started/connect-your-database.md deleted file mode 100644 index 656288be68a..00000000000 --- a/website/docs/docs/get-started/connect-your-database.md +++ /dev/null @@ -1,228 +0,0 @@ ---- -title: "Connect your database" -id: "connect-your-database" ---- - -You can connect to your database in dbt Cloud by clicking the gear in the top right and then selecting **Account Settings**. From the Account Settings page, click **+ New Project**. - -## IP Restrictions - -dbt Cloud will always connect to your data platform from the IP addresses specified in the [Regions](/docs/deploy/regions) page. - -Be sure to allow traffic from these IPs in your firewall, and include them in any database grants. - - - -- November 2020 — add the IPs `54.81.134.249` and `52.22.161.231` -- September 2022 — Add EMEA IPs - - - -Allowing these IP addresses only enables the connection to your . However, you might want to send API requests from your restricted network to the dbt Cloud API. For example, you could use the API to send a POST request that [triggers a job to run](https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun). Using the dbt Cloud API requires that you allow the `cloud.getdbt.com` subdomain. For more on the dbt Cloud architecture, see [Deployment architecture](/docs/deploy/architecture). - - -## Connecting to Postgres, Redshift, and AlloyDB - -The following fields are required when creating a Postgres, Redshift, or AlloyDB connection: - -| Field | Description | Examples | -| ----- | ----------- | -------- | -| Host Name | The hostname of the Postgres, Redshift, or AlloyDB database to connect to. 
This can either be a hostname or an IP address. | `xxx.us-east-1.amazonaws.com` | -| Port | Usually 5432 (Postgres) or 5439 (Redshift) | `5439` | -| Database | The logical database to connect to and run queries against. | `analytics` | - -**Note**: When you set up a Redshift or Postgres connection in dbt Cloud, SSL-related parameters aren't available as inputs. - - - -### Connecting via an SSH Tunnel - -To connect to a Postgres, Redshift, or AlloyDB instance via an SSH tunnel, select the **Use SSH Tunnel** option when creating your connection. When configuring the tunnel, you must supply the hostname, username, and port for the bastion server. - -Once the connection is saved, a public key will be generated and displayed for the Connection. You can copy this public key to the bastion server to authorize dbt Cloud to connect to your database via the bastion server. - - - -## Connecting to Snowflake - -The following fields are required when creating a Snowflake connection: - -| Field | Description | Examples | -| ----- | ----------- | -------- | -| Account | The Snowflake account to connect to. Take a look [here](/reference/warehouse-setups/snowflake-setup#account) to determine what the account field should look like based on your region.| | -| Role | A mandatory field indicating what role should be assumed after connecting to Snowflake | `transformer` | -| Database | The logical database to connect to and run queries against. | `analytics` | -| Warehouse | The virtual warehouse to use for running queries. | `transforming` | - - -**Note:** A crucial part of working with dbt atop Snowflake is ensuring that users (in development environments) and/or service accounts (in deployment to production environments) have the correct permissions to take actions on Snowflake! Here is documentation of some [example permissions to configure Snowflake access](snowflake-permissions). 
- -### Username / Password - -**Available in:** Development environments, Deployment environments - -The `Username / Password` auth method is the simplest way to authenticate -Development or Deployment credentials in a dbt project. Simply enter your Snowflake -username (specifically, the `login_name`) and the corresponding user's Snowflake `password` -to authenticate dbt Cloud to run queries against Snowflake on behalf of a Snowflake user. - -**Note**: The schema field in the **Developer Credentials** section is a required field. - - -### Key Pair -**Available in:** Development environments, Deployment environments - -The `Keypair` auth method uses Snowflake's [Key Pair Authentication](https://docs.snowflake.com/en/user-guide/python-connector-example.html#using-key-pair-authentication) to authenticate Development or Deployment credentials for a dbt Cloud project. - -After [generating an encrypted key pair](https://docs.snowflake.com/en/user-guide/key-pair-auth.html#configuring-key-pair-authentication), be sure to set the `rsa_public_key` for the Snowflake user to authenticate in dbt Cloud: - -```sql -alter user jsmith set rsa_public_key='MIIBIjANBgkqh...'; -``` - -Finally, set the "Private Key" and "Private Key Passphrase" fields in the "Edit -Credentials" page to finish configuring dbt Cloud to authenticate with Snowflake -using a key pair. - -**Note:** At this time ONLY Encrypted Private Keys are supported by dbt Cloud, and the keys must be of size 4096 or smaller. - -In order to successfully fill in the Private Key field, you **must** include the commented lines below when you add the passphrase. 
Leaving the `PRIVATE KEY PASSPHRASE` field empty will return an error - have a look at the examples below: - -**Example:** -```sql ------BEGIN ENCRYPTED PRIVATE KEY----- -< encrypted private key contents here > ------END ENCRYPTED PRIVATE KEY----- -``` - - -### Snowflake OAuth - -**Available in:** Development environments, Enterprise plans only - -The OAuth auth method permits dbt Cloud to run development queries on behalf of -a Snowflake user without the configuration of Snowflake password in dbt Cloud. For -more information on configuring a Snowflake OAuth connection in dbt Cloud, please see [the docs on setting up Snowflake OAuth](/docs/collaborate/manage-access/set-up-snowflake-oauth). - - -## Connecting to BigQuery - -### JSON keyfile - -:::info Uploading a service account JSON keyfile - -While the fields in a BigQuery connection can be specified manually, we recommend uploading a service account keyfile to quickly and accurately configure a connection to BigQuery. - -::: - -Uploading a JSON keyfile should populate the following fields: -- Project id -- Private key id -- Private key -- Client email -- Client id -- Auth uri -- Token uri -- Auth provider x509 cert url -- Client x509 cert url - -In addition to these fields, there are two other optional fields that can be configured in a BigQuery connection: - -| Field | Description | Examples | -| ----- | ----------- | ------- | -| Timeout | Deprecated; exists for backwards compatibility with older versions of dbt and will be removed in the future. | `300` | -| Location | The [location](https://cloud.google.com/bigquery/docs/locations) where dbt should create datasets. | `US`, `EU` | - - - - - -### BigQuery OAuth -**Available in:** Development environments, Enterprise plans only - -The OAuth auth method permits dbt Cloud to run development queries on behalf of -a BigQuery user without the configuration of BigQuery service account keyfile in dbt Cloud. 
For -more information on the initial configuration of a BigQuery OAuth connection in dbt Cloud, please see -[the docs on setting up BigQuery OAuth](/docs/collaborate/manage-access/set-up-bigquery-oauth). - -As an end user, if your organization has set up BigQuery OAuth, you can link a project with your personal BigQuery account in your personal Profile in dbt Cloud, like so: - - -## Connecting to Databricks -You can connect to Databricks by using one of two supported adapters: [dbt-databricks](/connect-your-database#dbt-databricks) and [dbt-spark](/connect-your-database#dbt-spark). For accounts on dbt 1.0 or later, we recommend using the dbt-databricks adapter. The dbt-databricks adapter is maintained by the Databricks team and is verified by dbt Labs. The Databricks team is committed to supporting and improving the adapter over time, so you can be sure the integrated experience will provide the best of dbt and the best of Databricks. Connecting to Databricks via dbt-spark will be deprecated in the future. - -### dbt-databricks Adapter -dbt-databricks is compatible with the following versions of dbt Core in dbt Cloud with varying degrees of functionality. - -| Feature | dbt Versions | -| ----- | ----------- | -| dbt-databricks | Available starting with dbt 1.0 in dbt Cloud| -| Unity Catalog | Available starting with dbt 1.1 | -| Python models | Available starting with dbt 1.3 | - -The dbt-databricks adapter offers: -- **Easier set up** -- **Better defaults:** -The dbt-databricks adapter is more opinionated, guiding users to an improved experience with less effort. Design choices of this adapter include defaulting to Delta format, using merge for incremental models, and running expensive queries with Photon. -- **Support for Unity Catalog:** -Unity Catalog allows Databricks users to centrally manage all data assets, simplifying access management and improving search and query performance. 
Databricks users can now get three-part data hierarchies – catalog, schema, model name – which solves a longstanding friction point in data organization and governance. - - -To set up the Databricks connection, supply the following fields: - -| Field | Description | Examples | -| ----- | ----------- | -------- | -| Server Hostname | The hostname of the Databricks account to connect to | dbc-a2c61234-1234.cloud.databricks.com | -| HTTP Path | The HTTP path of the Databricks cluster or SQL warehouse | /sql/1.0/warehouses/1a23b4596cd7e8fg | -| Catalog | Name of Databricks Catalog (optional) | Production | - - - -### dbt-spark Adapter - -dbt Cloud supports connecting to Databricks using -[a Cluster](https://docs.databricks.com/clusters/index.html) or -[a SQL Warehouse (formerly called SQL endpoints)](https://docs.databricks.com/sql/admin/sql-endpoints.html). -Depending on how you connect to Databricks, either one of the `Cluster` or -`Endpoint` configurations must be provided, but setting _both_ values is not -allowed. - -The following fields are available when creating a Databricks connection: - -| Field | Description | Examples | -| ----- | ----------- | -------- | -| Hostname | The hostname of the Databricks account to connect to | dbc-a2c61234-1234.cloud.databricks.com | -| Port | The port to connect to Databricks for this connection | 443 | -| Organization | Optional (default: 0) | 1123456677899012 | -| Cluster | The ID of the cluster to connect to (required if using a cluster) | 1234-567890-abc12345 | -| Endpoint | The ID of the endpoint to connect to (required if using Databricks SQL) | 1a23b4596cd7e8fg | -| User | Optional | dbt_cloud_user | - - - -## Connecting to Apache Spark - -### HTTP and Thrift - -dbt Cloud supports connecting to an Apache Spark cluster using the HTTP method -or the Thrift method. Note: While the HTTP method can be used to connect to -an all-purpose Databricks cluster, the ODBC method is recommended for all -Databricks connections. 
For further details on configuring these connection -parameters, please see the [dbt-spark documentation](https://github.com/dbt-labs/dbt-spark#configuring-your-profile) - -The following fields are available when creating an Apache Spark connection using the -HTTP and Thrift connection methods: - -| Field | Description | Examples | -| ----- | ----------- | -------- | -| Host Name | The hostname of the Spark cluster to connect to | `yourorg.sparkhost.com` | -| Port | The port to connect to Spark on | 443 | -| Organization | Optional (default: 0) | 0123456789 | -| Cluster | The ID of the cluster to connect to | 1234-567890-abc12345 | -| Connection Timeout | Number of seconds after which to timeout a connection | 10 | -| Connection Retries | Number of times to attempt connecting to cluster before failing | 10 | -| User | Optional | dbt_cloud_user | -| Auth | Optional, supply if using Kerberos | `KERBEROS` | -| Kerberos Service Name | Optional, supply if using Kerberos | `hive` | - - diff --git a/website/docs/docs/get-started/dbt-cloud-features.md b/website/docs/docs/get-started/dbt-cloud-features.md deleted file mode 100644 index c5963e53c4f..00000000000 --- a/website/docs/docs/get-started/dbt-cloud-features.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: "dbt Cloud features" -id: "dbt-cloud-features" ---- - - -## dbt Cloud IDE - - - -The dbt Cloud [integrated development environment (IDE)](/docs/get-started/develop-in-the-cloud) allows you to build, test, run, and version control your dbt projects directly from your browser. The IDE is the fastest and most reliable way to deploy dbt, and provides a real-time editing and execution environment for your dbt project — no command line required. Anyone can use the IDE, from new dbt developers to seasoned practitioners. 
- -With the Cloud IDE, you can: - -- Write modular SQL models with `select` statements and the [ref()](/reference/dbt-jinja-functions/ref) function -- Compile dbt code into SQL and execute it against your database directly -- Test every model before deploying it to production -- Generate and view documentation of your dbt project -- Leverage [git and version-control your code](/docs/collaborate/git/version-control-basics) from your browser with a couple of clicks -- Create and test Python models: - * You can `compile` Python models to see the full function that gets executed in your data platform - * You can see Python models in the DAG in dbt version 1.3 and higher - * You can't currently preview Python models -- Visualize a directed acyclic graph (DAG), [and more](/docs/get-started/dbt-cloud-tips) - -## IDE features -The dbt Cloud IDE comes with features, including better performance and exciting enhancements, making it easier for you to develop, build, compile, run and test data models. Check out some of the features below to learn more: - - -| Feature | Info | -|---|---| -| **File state indicators** | Ability to see when changes or actions have been made to the file. The indicators **M, U,** and **•** appear to the right of your file or folder name and indicate the actions performed:

          - Unsaved **(•)** — The IDE detects unsaved changes to your file/folder
          - Modification **(M)** — The IDE detects a modification of existing files/folders
          - Untracked **(U)** — The IDE detects changes made to new files or renamed files -| **Build, test, and run code** | Build, test, and run your project with a button click or by using the Cloud IDE command bar. -| **Drag and drop** | Drag and drop files located in the file explorer, and use the file breadcrumb on the top of the IDE for quick, linear navigation. Access adjacent files in the same file by right-clicking on the breadcrumb file. -| **Organize tabs** | You can:
          - Move your tabs around to reorganize your work in the IDE
          - Right-click on a tab to view and select a list of actions to take
          - Close multiple, unsaved tabs to batch save your work -| **Multiple selections** | You can make multiple selections for small and simultaneous edits. The below commands are a common way to add more cursors and allow you to insert cursors below or above with ease.

          - Option-Command-Down arrow
          - Option-Command-Up arrow
          - Press Option and click on an area -| **Formatting** | Format your files with a click of a button, powered by [sqlfmt](http://sqlfmt.com/). -| **Git diff view** | Ability to see what has been changed in a file before you make a pull request. -| **dbt autocomplete** | There are four new types of autocomplete features to help you develop faster:
          - Use `ref` to autocomplete your model names
          - Use `source` to autocomplete your source name + table name
          - Use `macro` to autocomplete your arguments
          - Use `env var` to autocomplete env var -| **Dark mode** | Use dark mode in the Cloud IDE for a great viewing experience in low-light environments. - - -## Related docs - -- [dbt Cloud tips](/docs/get-started/dbt-cloud-tips) -- [Develop in the Cloud](/docs/get-started/develop-in-the-cloud) -- [Guides](/docs/get-started/getting-started/overview) diff --git a/website/docs/docs/get-started/dbt-cloud-tips.md b/website/docs/docs/get-started/dbt-cloud-tips.md deleted file mode 100644 index 687c6c97643..00000000000 --- a/website/docs/docs/get-started/dbt-cloud-tips.md +++ /dev/null @@ -1,95 +0,0 @@ ---- -title: "dbt Cloud tips" -id: dbt-cloud-tips -description: "Check out any dbt Cloud and IDE-related tips." -sidebar_label: "dbt Cloud tips" ---- - -# dbt Cloud tips - -The Cloud IDE provides keyboard shortcuts, features, and development tips to help you work faster and be more productive. Use this Cloud IDE cheat sheet to help you quickly reference some common operations. - -## Most popular - -**Organize tabs** - -You can move your tabs around to reorganize your work in the IDE. You can also right-click on a tab to close multiple tabs or view and select a list of actions to take. Right-click in the editor to access other options, including text wrap. - -**Drag and drop** - -You can also drag and drop files in the file tree or in the editor. Use the file breadcrumb on the top of the IDE for quicker navigation. You can access adjacent files in the same file by right-clicking on the breadcrumb file. - -**Search across files** - -You can quickly search over all files in the IDE on your current project. To search, open the search bar by pressing Command-O to find text across all files in your current project and write your filename. You can view the results under the search text, which are grouped into files containing the match. You can click on the results to view them in the IDE.
- -**Command bar and status** - -You can run commands from the command bar at the bottom of the IDE or by using the **Build** button. Use the [rich model selection syntax](/docs/reference/node-selection/syntax) to run [dbt commands](/docs/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking **Runs**. - -The status icon on the lower right corner of the IDE gives you an indicator of the health of your project. You can identify errors by clicking on the status icon for more details or by clicking **Restart the IDE**. - -**Find and replace** - -Press Command-F or Control-F to open the find-and-replace bar in the upper right corner of the current file in the IDE. The IDE highlights your search results in the current file and code outline. You can use the up and down arrows to see the match highlighted in the current file when there are multiple matches. To replace the text with something else, use the left arrow. - -**DAG in the IDE** - -Double-click a node in the directed acyclic graph (DAG) to open that file in a new tab. Click **Expand** on the upper right side of the DAG and use node selection syntax (`select` or `exclude`) to view a subset of your DAG. - -## IDE Keyboard shortcuts - -There are default keyboard shortcuts that can help make development more productive and easier for everyone. - -- Command-O or Control-O to select a file to open -- Command-P or Control-P to see command palette -- Hold Option-click-on-an-area to select multiple lines and perform a multi-edit. You can also press Command-E to perform this operation on the command line. 
-- Press Fn-F1 to view a list of the other editor shortcuts -- Command-Enter or Control-Enter to Preview your code -- Command-Shift-Enter or Control-Shift-Enter to Compile -- Highlight a portion of code and use the above shortcuts to Preview or Compile code -- Enter two underscores (__) in the IDE to reveal a list of dbt functions - -## Multiple selections - -You can make multiple selections for small and simultaneous edits. The below commands are a common way to add more cursors and allow you to insert cursors below or above with ease. - -- Option-Command-Down arrow -- Option-Command-Up arrow -- Press Option and click on an area - -## Package tips - -- Use the [dbt_codegen](https://hub.getdbt.com/dbt-labs/codegen/latest/) package to help you generate YML files for your models and sources and SQL files for your staging models. -- The [dbt_utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) package contains macros useful for daily development. For example, `date_spine` generates a table with all dates between the ones provided as parameters. -- The [dbt_project_evaluator](https://hub.getdbt.com/dbt-labs/dbt_project_evaluator/latest) package compares your dbt project against a list of our best practices and provides suggestions and guidelines on how to update your models. -- The [dbt_expectations](https://hub.getdbt.com/calogica/dbt_expectations/latest) package contains many tests beyond those built into dbt Core. -- The [dbt_audit_helper](https://hub.getdbt.com/#:~:text=adwords-,audit_helper,-codegen) package lets you compare the output of 2 queries. Use it when refactoring existing logic to ensure that the new results are identical. -- The [dbt_artifacts](https://hub.getdbt.com/brooklyn-data/dbt_artifacts/latest) package saves information about your dbt runs directly to your data platform so that you can track the performance of models over time. 
-- The [dbt_meta_testing](https://hub.getdbt.com/tnightengale/dbt_meta_testing/latest) package checks that your dbt project is sufficiently tested and documented. - -## Advanced tips - -- Use your folder structure as your primary selector method. `dbt build --select marts.marketing` is simpler and more resilient than relying on tagging every model. -- Think about jobs in terms of build cadences and SLAs. Run models that have hourly, daily, or weekly build cadences together. -- Use the [where config](/docs/reference/resource-configs/where) for tests to test an assertion on a subset of records. -- [store_failures](/docs/reference/resource-configs/store_failures) lets you examine records that cause tests to fail, so you can either repair the data or change the test as needed. -- Use [severity](/docs/reference/resource-configs/severity) thresholds to set an acceptable number of failures for a test. -- Use [incremental_strategy](/docs/build/incremental-models#about-incremental_strategy) in your incremental model config to implement the most effective behavior depending on the volume of your data and reliability of your unique keys. -- Set `vars` in your `dbt_project.yml` to define global defaults for certain conditions, which you can then override using the `--vars` flag in your commands. -- Use [for loops](/docs/get-started/learning-more/using-jinja#use-a-for-loop-in-models-for-repeated-sql) in Jinja to [DRY](https://docs.getdbt.com/terms/dry) up repetitive logic, such as selecting a series of columns that all require the same transformations and naming patterns to be applied. -- Instead of relying on post-hooks, use the [grants config](/docs/reference/resource-configs/grants) to apply permission grants in the warehouse resiliently. -- Define [source-freshness](/docs/build/sources#snapshotting-source-data-freshness) thresholds on your sources to avoid running transformations on data that has already been processed. 
-- Use the `+` operator on the left of a model `dbt build --select +model_name` to run a model and all of its upstream dependencies. Use the `+` operator on the right of the model `dbt build --select model_name+` to run a model and everything downstream that depends on it. -- Use `dir_name` to run all models in a package or directory. -- Use the `@` operator on the left of a model in a non-state-aware CI setup to test it. This operator runs all of a selection’s parents and children, and also runs the parents of its children, which in a fresh CI schema will likely not exist yet. -- Use the [--exclude flag](/docs/reference/node-selection/exclude) to remove a subset of models out of a selection. -- Use [state and deferral](/docs/deploy/cloud-ci-job#deferral-and-state-comparison) to create a slim CI setup. -- Use the [--full-refresh](/docs/reference/commands/run#refresh-incremental-models) flag to rebuild an incremental model from scratch. -- Use [seeds](/docs/build/seeds) to create manual lookup tables, like zip codes to states or marketing UTMs to campaigns. `dbt seed` will build these from CSVs into your warehouse and make them `ref` able in your models. -- Use [target.name](/docs/build/custom-schemas#an-alternative-pattern-for-generating-schema-names) to pivot logic based on what environment you’re using. For example, to build into a single development schema while developing, but use multiple schemas in production. - -## Related docs - -- [Getting started](/docs/get-started/getting-started/overview) -- [Develop in the Cloud](/docs/get-started/develop-in-the-cloud) diff --git a/website/docs/docs/get-started/develop-in-the-cloud.md b/website/docs/docs/get-started/develop-in-the-cloud.md deleted file mode 100644 index a29e18082ed..00000000000 --- a/website/docs/docs/get-started/develop-in-the-cloud.md +++ /dev/null @@ -1,184 +0,0 @@ ---- -title: "Develop in the Cloud" -id: develop-in-the-cloud -description: "Develop, test, run, and build in the Cloud IDE." 
-sidebar_label: Develop in the Cloud ---- - - - -The dbt Cloud integrated development environment (IDE) is a single interface for building, testing, running, and version-controlling dbt projects from your browser. With the Cloud IDE, you can compile dbt code into SQL and run it against your database directly. - -The IDE leverages the open-source [dbt-rpc](/reference/commands/rpc) plugin to recompile only the changes made in your project. - -## Prerequisites - -To develop in the Cloud IDE, make sure you have the following: - -- Your dbt project must be compatible with dbt version 0.15.0 or higher. The dbt IDE is powered by the [dbt-rpc](/reference/commands/rpc) which was overhauled in dbt v0.15.0 -- You must have a [dbt Cloud account](https://cloud.getdbt.com/) and [Developer seat license](/docs/collaborate/manage-access/seats-and-users) -- You must have a git repository set up and your git provider must have `write` access enabled. See [Connecting your GitHub Account](/docs/collaborate/git/connect-github) and [Importing a project by git URL](/docs/collaborate/git/import-a-project-by-git-url) for detailed setup instructions -- Your dbt project must be connected to a [data platform](/docs/get-started/connect-your-database) -- You must have a [**development environment** and **development credentials**](#set-up-and-access-the-cloud-ide) set up -- The environment must be on dbt version 1.0 or higher - -### Start up and work retention in the IDE - - - -
          Start up process Work retention
          - -There are three start-up states when using or launching the Cloud IDE: - -- Creation start — This is the state where you are starting the IDE for the first time. You can also view this as a *cold start* (see below), and you can expect this state to take longer because the git repository is being cloned. -- Cold start — This is the process of starting a new develop session, which will be available for you for three hours. The environment automatically turns off three hours after the last activity with the rpc server. This includes compile, preview, or any dbt invocation, however, it *does not* include editing and saving a file. -- Hot start — This is the state of resuming an existing or active develop session within three hours of the last activity. - - - - - -The Cloud IDE needs explicit action to save your changes. There are three ways your work is stored: - -- Unsaved, local code — Any code you write is automatically available from your browser’s storage. You can see your changes but will lose them if you switch branches or browsers (another device or browser). -- Saved but uncommitted code — When you save a file, the data gets stored in your local storage (EFS storage). If you switch branches but don’t *commit* your saved changes, you will lose your changes. -- Committed code — This is stored in the branch with your git provider and you can check out other (remote) branches. - - -
          - -## Set up and access the Cloud IDE - -:::info📌 - -New to dbt? Check out our [Getting Started guide](/docs/get-started/getting-started/overview) to build your first dbt project in the Cloud IDE! - -::: - -In order to start experiencing the great features of the Cloud IDE, you need to first set up your **Development environment** and **Development credentials.** - -If you’re new to dbt, you will automatically add this during the project setup. However, if you have an existing dbt Cloud account, you may need to create a development environment and credentials manually to use the Cloud IDE. - -Review the steps below to set up your development environment and credentials: - - -**Development environment** - -1. Create a development environment and choose **Deploy** and then **Environments** from the top left. Click **Create Environment**. - - - -2. Enter an environment name that would help you identify it among your other environments (for example, `Nate's Development Environment`). -3. Choose **Development** as the **Environment Type**. -4. You can also select which **dbt Version** to use at this time. For compatibility reasons, we recommend that you select the same dbt version that you plan to use in your deployment environment. -5. Click **Save** to finish creating your **Development environment**. - - - - - -**Developer credentials** - -The IDE uses developer credentials to connect to your data platform. These developer credentials should be specific to your user and they should *not* be super user credentials or the same credentials that you use for your production deployment of dbt. - -Follow the below steps to set up your developer credentials: - -1. Go to the [**Credentials**](https://cloud.getdbt.com/next/settings/profile#credentials) section. - -2. Select the relevant project in the list. - -3. Click **Edit** on the bottom right of the page - -4. 
Enter your developer credentials and then click **Save.** - -Great job, you should now be able to access the Cloud IDE by clicking **Develop** on the navigation to start developing! - - - -### Access the Cloud IDE - -Now that you've set up your development environment and credentials, you should be able to access the Cloud IDE: - -1. Log in with your [dbt Cloud account](https://cloud.getdbt.com/). If you don't have one, [sign up](https://www.getdbt.com/signup/) for an account for free. -2. Click **Develop** at the top of the page -3. Make sure you've already initialized your project -4. Start developing and use the image and guide below to familiarize yourself with the Cloud IDE and its [features](/docs/get-started/dbt-cloud-features#ide-features): - - - -| Number | Feature | Info | -|---|---|---| -| 1. | File Tree | The file tree allows you to organize your project and manage your files and folders. Click the three-dot menu associated with the file or folder to create, rename, or delete it. Note: This function is unavailable if you’re on the **main** branch. | -| 2. | Editor | This is where you edit your files. You can use the tab for each editor to position it exactly where you need it. | -| 3. | IDE git button | The git button in the IDE allows you to apply the concept of [version control](/docs/collaborate/git/version-control-basics) to your project and you can execute git commands directly in the IDE. | -| 4. | Command bar | You can enter and run commands from the command bar at the bottom of the IDE. Use the [rich model selection syntax](/reference/node-selection/syntax) to execute [dbt commands](/reference/dbt-commands) directly within dbt Cloud. You can also view the history, status, and logs of previous runs by clicking **History** on the left of the bar. | -| 5. | Status bar | This area provides you with useful information about your IDE and project status.
You also have additional options like restarting or [recloning your repo](/docs/collaborate/git/version-control-basics).| -| 6. | Preview

          Compile

          Build | This is where you can preview, compile or build your dbt project, as well as see the results and the DAG. | -| 7. | Lineage tab | You can see how models are used as building blocks from left to right to transform your data from raw sources into cleaned-up modular derived pieces and final outputs on the far right of the DAG. You can access files in the **Lineage** tab by double-clicking on a particular model. Expand the DAG into fullscreen to view it differently. Note: The default view is `+model+`; however, you can change it to `2+model+2`. | -| 8. | Change branches and view documentation | Change branches in fewer clicks and focus on developing. You can generate and view your [documentation](/docs/collaborate/build-and-view-your-docs) for your dbt project in real time. You can inspect and verify what your project's documentation will look like before you deploy your changes to production.| -| 9. | File state indicators | The file state indicators will indicate and track any action or change made in your project. The indicators **M, U, and •** appear to the right of your file or folder name, and also under the **Changes** section. | -| 10. | Format button | This is where you can format your dbt project code. The new **Format** button formats your file and is powered by [sqlfmt](http://sqlfmt.com/).| - -## Build, compile, and run projects - -You can *build*, *compile*, *run*, and *test* dbt projects by using the command bar. The Cloud IDE will update in real time when you run models, tests, seeds, and operations. - -If a model or test fails, you can review the logs to find and fix the issue. - -You can also use dbt's [rich model selection syntax](/reference/node-selection/syntax) to [run dbt commands](/reference/dbt-commands) directly within dbt Cloud. - - - - - -## Build and view your project's docs - -The dbt Cloud IDE makes it possible to view documentation for your dbt project while your code is still in development.
With this workflow, you can inspect and verify what your project's generated documentation will look like before your changes are released to production. - -To generate your project’s documentation (docs) in the IDE, run `dbt docs generate` in the command bar. This command generates the docs for your dbt project as it currently exists in development. - -After you generate a successful run, you can view your documentation for your dbt project in real time by clicking **View Docs** or the book icon above the file tree. - -You can view the latest version of your documentation rendered in a new browser window, and inspect and verify what your project's documentation will look like before you deploy your changes to production. - - -## Related docs - -- [What is dbt?](/docs/introduction#dbt-features) -- [dbt Learn courses](https://courses.getdbt.com/collections) -- [dbt Cloud features](/docs/get-started/dbt-cloud-features) -- [Version control basics](/docs/collaborate/git/version-control-basics) -- [dbt Commands](/reference/dbt-commands) - - -## Related questions - -
          - Is there a cost to using the Cloud IDE? -
          -
          Not at all! You can use dbt Cloud when you sign up for the Free Developer plan, which comes with one developer seat. If you’d like to access more features or have more developer seats, you can upgrade your account to the Team or Enterprise plan. See dbt pricing plans for more details.
          -
          -
          -
          - Can I be a contributor to dbt Cloud? -
          -
          Anyone can contribute to the dbt project. And whether it's a dbt package, a plugin, dbt-core, or this documentation site, contributing to the open source code that supports the dbt ecosystem is a great way to level yourself up as a developer, and give back to the community. See Contributing for details on what to expect when contributing to the dbt open source software (OSS).
          -
          -
          -
          - What is the difference between developing on the Cloud IDE and on the CLI? -
          -
          There are two main ways to develop with dbt: using the web-based IDE in dbt Cloud or using the command-line interface (CLI) in dbt Core:

          - - dbt Cloud IDE dbt Cloud is a web-based application that allows you to develop dbt projects with the IDE, includes a purpose-built scheduler, and provides an easier way to share your dbt documentation with your team. The IDE is a faster and more reliable way to deploy your dbt models and provides a real-time editing and execution environment for your dbt project.

          - - dbt Core CLI The command line interface (CLI) uses dbt Core, open-source software that’s freely available. You can build your dbt project in a code editor, like JetBrains or VSCode, and run dbt commands from the command line. -
          -
          -
          -
          - What type of support is provided with dbt Cloud? -
          -
          The global dbt Support team is available to help dbt Cloud users by email or in-product live chat. Developer and Team accounts offer 24x5 support, while Enterprise customers have priority access and options for custom coverage.

          If you have project-related or modeling questions, review our Support page or dbt Community Slack to get help as well.
          -
          -
          diff --git a/website/docs/docs/get-started/getting-started-dbt-core.md b/website/docs/docs/get-started/getting-started-dbt-core.md deleted file mode 100644 index 11c0666b44d..00000000000 --- a/website/docs/docs/get-started/getting-started-dbt-core.md +++ /dev/null @@ -1,261 +0,0 @@ ---- -title: "Getting started with dbt Core" -id: getting-started-dbt-core -description: "Connecting your warehouse to dbt Core using the CLI." -sidebar_label: "Getting started" ---- - -When you use dbt Core to work with dbt, you will be editing files locally using a code editor, and running projects using the dbt command line interface (dbt CLI). If you'd rather edit files and run projects using the web-based Integrated Development Environment (IDE), you should refer to [Getting set up in dbt Cloud](/docs/get-started/getting-started/set-up-dbt-cloud). - -## Prerequisites - -* To use the dbt CLI, it's important that you know some basics of the Terminal. In particular, you should understand `cd`, `ls` and `pwd` to navigate through the directory structure of your computer easily. -* Install dbt Core using the [installation instructions](/docs/get-started/installation) for your operating system. -* Complete [Setting up (in BigQuery)](/docs/get-started/getting-started/getting-set-up/setting-up-bigquery#setting-up) and [Loading data (BigQuery)](/docs/get-started/getting-started/getting-set-up/setting-up-bigquery#loading-data). -* [Create a GitHub account](https://github.com/join) if you don't already have one. - -## Create a starter project - -After setting up BigQuery to work with dbt, you are ready to create a starter project with example models, before building your own models. - -### Create a repository - -The following steps use [GitHub](https://github.com/) as the Git provider for this guide, but you can use any Git provider. You should have already [created a GitHub account](https://github.com/join). - -1. 
[Create a new GitHub repository](https://github.com/new) named `dbt-tutorial`. -2. Select **Public** so the repository can be shared with others. You can always make it private later. -3. Leave the default values for all other settings. -4. Click **Create repository**. -5. Save the commands from "…or create a new repository on the command line" to use later in [Commit your changes](#commit-your-changes). - -### Create a project - -Learn how to use a series of commands using the command line of the Terminal to create your project. dbt Core includes an `init` command that helps scaffold a dbt project. - -To create your dbt project: - -1. Make sure you have dbt Core installed and check the version using the `dbt --version` command: - - ```terminal - dbt --version - ``` - -2. Initiate the `jaffle_shop` project using the `init` command: - - ```terminal - dbt init jaffle_shop - ``` - -3. Navigate into your project's directory: - - ```terminal - cd jaffle_shop - ``` - -4. Use `pwd` to confirm that you are in the right spot: - - ```terminal - $ pwd - > Users/BBaggins/dbt-projects/jaffle_shop - ``` - -5. Use a code editor like Atom or VSCode to open the project directory you created in the previous steps, which we named jaffle_shop. The content includes folders and `.sql` and `.yml` files generated by the `init` command. - -
          - -
          - -6. Update the following values in the `dbt_project.yml` file: - - - - ```yaml - name: jaffle_shop # Change from the default, `my_new_project` - - ... - - profile: jaffle_shop # Change from the default profile name, `default` - - ... - - models: - jaffle_shop: # Change from `my_new_project` to match the previous value for `name:` - ... - ``` - - - -### Connect to BigQuery - -When developing locally, dbt connects to your warehouse using a [profile](/docs/get-started/connection-profiles), which is a yaml file with all the connection details to your warehouse. - -1. Create a file in the `~/.dbt/` directory named `profiles.yml`. -2. Move your BigQuery keyfile into this directory. -3. Copy the following and paste into the new profiles.yml file. Make sure you update the values where noted. - - - - ```yaml - jaffle_shop: # this needs to match the profile in your dbt_project.yml file - target: dev - outputs: - dev: - type: bigquery - method: service-account - keyfile: /Users/BBaggins/.dbt/dbt-tutorial-project-331118.json # replace this with the full path to your keyfile - project: grand-highway-265418 # Replace this with your project id - dataset: dbt_bbagins # Replace this with dbt_your_name, e.g. dbt_bilbo - threads: 1 - timeout_seconds: 300 - location: US - priority: interactive - ``` - - - -4. Run the `debug` command from your project to confirm that you can successfully connect: - - ```terminal - $ dbt debug - > Connection test: OK connection ok - ``` - -
          - -
          - -#### FAQs - - - - - - - -### Perform your first dbt run - -Our sample project has some example models in it. We're going to check that we can run them to confirm everything is in order. - -1. Enter the `run` command to build example models: - - ```terminal - dbt run - ``` - -You should have an output that looks like this: -
          - -
          - -### Commit your changes - -Commit your changes so that the repository contains the latest code. - -1. Link the GitHub repository you created to your dbt project by running the following commands in Terminal. Make sure you use the correct git URL for your repository, which you should have saved from step 5 in [Create a repository](#create-a-repository). - - ```terminal - git init - git branch -M main - git add . - git commit -m "Create a dbt project" - git remote add origin https://github.com/USERNAME/dbt-tutorial.git - git push -u origin main - ``` - -2. Return to your GitHub repository to verify your new files have been added. - -## Build your first models - -Now that you set up your sample project, you can get to the fun part — [building models](/docs/build/sql-models)! You will take a sample query and turn it into a model in your dbt project. - -### Checkout a new git branch - -Check out a new git branch to work on new code: - -1. Create a new branch by using the `checkout` command and passing the `-b` flag: - - ```terminal - $ git checkout -b add-customers-model - > Switched to a new branch `add-customers-model` - ``` - -### Build your first model - -1. Open your project in your favorite code editor. -2. Create a new SQL file in the `models` directory, named `models/customers.sql`. -3. Paste the following query into the `models/customers.sql` file. - - - -4. From the command line, enter `dbt run`. -
          - -
          - -When you return to the BigQuery console, you can `select` from this model. - -#### FAQs - - - - - - - -### Change the way your model is materialized - - - - - -### Delete the example models - - - -### Build models on top of other models - - - -### Next steps - - - -You can also explore: - -* The `target` directory to see all of the compiled SQL. The `run` directory shows the create or replace table statements that are running, which are the select statements wrapped in the correct DDL. -* The `logs` file to see how dbt Core logs all of the action happening within your project. It shows the selects statements that are running and the python logging happening when dbt runs. - -## Test and document your project - -### Add tests to your models - - - -### Document your models - - - -3. Run `dbt docs serve` command to launch the documentation in a local website. - -#### FAQs - - - - - -#### Next steps - - - -### Commit updated changes - -You need to commit the changes you made to the project so that the repository has your latest code. - -1. Add all your changes to git: `git add -A` -2. Commit your changes: `git commit -m "Add customers model, tests, docs"` -3. Push your changes to your repository: `git push` -4. Navigate to your repository, and open a pull request to merge the code into your master branch. - -## Schedule a job - -We recommend using dbt Cloud to schedule a job. For more information about using dbt Core to schedule a job, see [dbt airflow](/blog/dbt-airflow-spiritual-alignment) blog post or [deployments](/docs/deploy/deployments). 
diff --git a/website/docs/docs/get-started/getting-started/building-your-first-project.md b/website/docs/docs/get-started/getting-started/building-your-first-project.md deleted file mode 100644 index 1bbab1a1273..00000000000 --- a/website/docs/docs/get-started/getting-started/building-your-first-project.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -title: "Building your first project" -id: building-your-first-project -description: "" -sidebar_label: "Building your first project" ---- - -Once you set up your warehouse and connect to dbt Cloud, you are ready to build your first dbt project. - -In this guide, you will: - -* Build your first models -* Test and document your project -* Schedule a job to be run in your production environment diff --git a/website/docs/docs/get-started/getting-started/building-your-first-project/build-your-first-models.md b/website/docs/docs/get-started/getting-started/building-your-first-project/build-your-first-models.md deleted file mode 100644 index 48d859fdeaf..00000000000 --- a/website/docs/docs/get-started/getting-started/building-your-first-project/build-your-first-models.md +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: Build your first models -id: build-your-first-models -description: "Now that you've set up the starter project, you can get to the fun part — building models!" ---- - -Now that you set up your sample project and had a successful run, you can get to the fun part — [building models](/docs/build/sql-models)! You will take a sample query and turn it into a model in your dbt project. A model in dbt is a select statement. - -## Checkout a new git branch - -1. Click **Develop** from the upper left of dbt Cloud. You need to create a new branch since the main branch is now set to read-only mode. - -2. Click **Create branch**, and name your branch `add-customers-model`. - - - - -## Build your first model - -1. Click **Develop** from the upper left of dbt Cloud. -2. 
Click the **...** next to the Models directory, then select **Create file**. -3. Name the file `models/customers.sql`, then click **Create**. -4. Copy the following query into the file and click **Save File**. - - - -5. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see three models under DETAILS. - -In the future, you would connect your business intelligence (BI) tools to these views and tables so they only read cleaned up data rather than raw data in your BI tool. - -#### FAQs - - - - - - - -## Change the way your model is materialized - - - -## Delete the example models - - - -## Build models on top of other models - - - -## Next steps - -Once you build your first model, you're ready to [test and document your project](/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project). - - diff --git a/website/docs/docs/get-started/getting-started/building-your-first-project/schedule-a-job.md b/website/docs/docs/get-started/getting-started/building-your-first-project/schedule-a-job.md deleted file mode 100644 index a6b363738b0..00000000000 --- a/website/docs/docs/get-started/getting-started/building-your-first-project/schedule-a-job.md +++ /dev/null @@ -1,65 +0,0 @@ ---- -title: Schedule a job -id: schedule-a-job -description: In this part of the guide, we'll go through how you can schedule a job in dbt Cloud. ---- - -In this part of the guide, you will learn how to schedule a job to be run in your production environment. Scheduling a job is sometimes called _deploying a project_. - -As the `jaffle_shop` business gains more customers, and those customers create more orders, you will see more records added to your source data. Because you materialized the `customers` model as a table, you'll need to periodically rebuild your table to ensure that the data stays up-to-date. This update will happen when you run a job. 
- -## Commit your changes - -Now that you've built your customer model, you need to commit the changes you made to the project so that the repository has your latest code. - -1. Click **Commit** and add a message. For example, "Add customers model, tests, docs." -2. Click **merge to main** to add these changes to the main branch on your repo. - -## Create a deployment environment - -1. In the upper left, select **Deploy**, then click **Environments**. -2. Click **Create Environment**. -3. Name your deployment environment. For example, "Production." -4. Add a target dataset, for example, "Analytics." dbt will build into this dataset. For some warehouses this will be named "schema." -5. Click **Save**. - -## Create and run a job - -Jobs are a set of dbt commands that you want to run on a schedule. For example, `dbt run` and `dbt test`. - -1. After creating your deployment environment, you should be directed to the page for the new environment. If not, select **Deploy** in the upper left, then click **Jobs**. -2. Click **Create one** and provide a name, for example "Production run", and link to the Environment you just created. -3. Scroll down to "Execution Settings" and select **Generate docs on run**. -4. Under "Commands," add these commands as part of your job if you don't see them: - * `dbt run` - * `dbt test` -5. For this exercise, **do NOT** set a schedule for your project to run -- while your organization's project **should** run regularly, there's no need to run this project on a schedule. -6. Select **Save**, then click **Run now** to run your job. -7. Click the run and watch its progress under "Run history." -8. Once the run is complete, click **View Documentation** to see the docs for your project. - -:::tip -Congratulations 🎉! You've just deployed your first dbt project! -::: - -### FAQs - - - -## Next steps - -Congratulations! Now that you've got a working dbt project, you can read about dbt [best practices](/guides/best-practices). 
- -You can improve your dbt skills with these fun exercises: - -* Turn your raw data references (for example, turn `` `dbt-tutorial`.jaffle_shop.orders``) into [sources](/docs/build/sources). -* Build a new model for `orders` that uses the `payments` table to calculate the total order amount. -* Reorganize your project into [how we structure dbt projects](/blog/how-we-structure-our-dbt-projects). -* If you want a more in-depth learning experience, we recommend taking the [dbt Fundamentals on our dbt Learn online courses site](https://courses.getdbt.com/courses/fundamentals). - - -Here are some ways to learn more essential dbt skills: - -* Learn how to use Jinja in your project by reading the [Jinja tutorial](/docs/get-started/learning-more/using-jinja). -* Learn how to [connect to dbt Core using the CLI](/docs/get-started/getting-started-dbt-core). -* Refactor [legacy SQL to dbt SQL](/docs/get-started/learning-more/refactoring-legacy-sql). diff --git a/website/docs/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project.md b/website/docs/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project.md deleted file mode 100644 index db7fa7f3ec0..00000000000 --- a/website/docs/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Test and document your project -id: test-and-document-your-project -description: Let's test and document our models to build trust in our data. ---- - -## Add tests to your models - - - -## Document your models - - - -3. Click the link above the file tree in the Develop interface to launch documentation in a new tab. - -#### FAQs - - - - -## Next steps - -Once you test and document your project, you're ready to [Schedule a job](/docs/get-started/getting-started/building-your-first-project/schedule-a-job). 
- - diff --git a/website/docs/docs/get-started/getting-started/create-a-project.md b/website/docs/docs/get-started/getting-started/create-a-project.md deleted file mode 100644 index 9ecd487d527..00000000000 --- a/website/docs/docs/get-started/getting-started/create-a-project.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -title: Create a project -id: create-a-project -description: "When you are ready to create projects in dbt, you can use either dbt Cloud or the dbt CLI." ---- - -Once you set up your data platform, you can create a project in dbt. When you're ready to create a project, you need to decide which interface best suits your needs for working with dbt. - -## Choose how to work with dbt - -You can work with dbt in two ways: - -* **dbt Cloud**: Edit files and run projects using the web-based Integrated Development Environment (IDE). -* **dbt CLI**: Edit files locally using a code editor, and run projects using the command line interface (CLI). - -To use the CLI, it's important that you know some basics of your terminal. In particular, you should understand `cd`, `ls` and `pwd` to navigate through the directory structure of your computer easily. As such, if you are new to programming, we recommend using **dbt Cloud** for this guide. - -If you wish to use the CLI, please follow the [installation instructions](/docs/get-started/installation) for your operating system. diff --git a/website/docs/docs/get-started/getting-started/getting-set-up.md b/website/docs/docs/get-started/getting-started/getting-set-up.md deleted file mode 100644 index 3ef1731bc38..00000000000 --- a/website/docs/docs/get-started/getting-started/getting-set-up.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -title: About setting up dbt Cloud -id: set-up-dbt-cloud -description: "Set up a sample project to get you started using dbt Cloud." -sidebar_label: "About set up" ---- - -Learn how to set up dbt Cloud using a sample project and one of the most common data platforms. 
You can select from [BigQuery](/docs/get-started/getting-started/getting-set-up/setting-up-bigquery), [Databricks](/docs/get-started/getting-started/getting-set-up/setting-up-databricks), [Redshift](/docs/get-started/getting-started/getting-set-up/setting-up-redshift), or [Snowflake](/docs/get-started/getting-started/getting-set-up/setting-up-snowflake). If you're not sure, then try [BigQuery](/docs/get-started/getting-started/getting-set-up/setting-up-bigquery). - -This guide will show you how to set up dbt and perform some key tasks. These tasks will illustrate how dbt establishes standard practices for your work: - -* Set up a warehouse with sample data -* Connect the warehouse to dbt -* Add a Git repository to dbt -* Execute a dbt transformation using `dbt run` -* Schedule a job or transformation - -If you want a more in-depth learning experience, we recommend taking the dbt Fundamentals on our [dbt Learn online courses site](https://courses.getdbt.com/). - -If you'd rather edit and run files locally using the dbt command line interface (dbt CLI) to connect to dbt Core, then you can refer to [Getting started using dbt Core](/docs/get-started/getting-started-dbt-core). diff --git a/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-bigquery.md b/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-bigquery.md deleted file mode 100644 index 86ef5bfc6ef..00000000000 --- a/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-bigquery.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -title: "Set up and connect BigQuery" -id: setting-up-bigquery -description: "Set up BigQuery with sample data and connect to dbt Cloud." -sidebar_label: "Set up and connect BigQuery" -pagination_next: docs/get-started/getting-started/building-your-first-project/build-your-first-models ---- - -## Introduction - -For the BigQuery project in the getting started guide, you'll learn how to set up BigQuery and connect it to dbt Cloud. 
- -This guide will walk you through: - -- Setting up a new BigQuery instance -- Accessing sample data in a public data set -- Connecting dbt Cloud to BigQuery - -## Prerequisites - -Before beginning this guide, make sure that you have access to a [new or existing Google account](https://support.google.com/accounts/answer/27441?hl=en). You can use a personal or work account to set up BigQuery through [Google Cloud Platform (GCP)](https://cloud.google.com/free). - -## Setting up - - - -Before jumping into the steps below, log in to your Google account. - -1. Navigate to the [BigQuery Console](https://console.cloud.google.com/bigquery). - - If you don't have a Google Cloud Platform account you will be asked to create one. - - If you do have one (or multiple) it will likely log you into your oldest account. Click your profile picture to the right and verify you are using the correct email account. - -2. Create a new project for this tutorial: - - If you've just created a BigQuery account, you'll be prompted to create a new project straight away. - - If you already have an existing organization, you can select the project drop down in the header bar, and create a new project from there. -
          - -
          - -3. Select **NEW PROJECT**. You should see a project name automatically populate. You can change the name to something more relevant, for example "dbt Learn - Bigquery Setup." - -
          - -
          - -4. Click **Create**. - -## Loading data - -BigQuery supports public data sets that can be directly queried, so we will show you how to access those datasets via select statements. Additionally, we will show you how to populate your own database objects with that data. - -1. Navigate to the [BigQuery Console](https://console.cloud.google.com/bigquery) again. Make sure your new project is selected in the header. If you do not see your account or project, click your profile picture to the right and verify you are using the correct email account. - -2. Copy and paste the queries below into the Query Editor to validate that you are able to run them successfully. - - ```sql - select * from `dbt-tutorial.jaffle_shop.customers`; - select * from `dbt-tutorial.jaffle_shop.orders`; - select * from `dbt-tutorial.stripe.payment`; - ``` - -3. Verify you can see an output: -
          - -
          -4. Create datasets. Datasets in BigQuery are equivalent to schemas in a traditional database. - - 1. Find your project in the picker. Click the three dots to expose options. - 2. Click **Create dataset**. -
          - -
          - 3. Fill in `Dataset ID` as required. This will be used like a schema in fully qualified references to your database objects (that is, `database.schema.table`), so choose a name that fits the purpose. In this case, we will create one now for `jaffle_shop` and one for `stripe` later. -
          - -
          - 4. Leave the default values in the rest of the fields: - - `Data location` can be left blank -- if selected, this determines the GCP location where your data is stored. The current default location is the US multi-region. All tables within this dataset will share this location. - - Even though it is unchecked, billing table expiration will be set automatically to 60 days, because billing has not been enabled for this project, so GCP defaults to deprecating tables. - - Allow Google to manage encryption. - - Click `CREATE DATASET`. - - Repeat steps i through iv for the second dataset, `stripe`. - -## Connecting to dbt Cloud - -You will learn how to connect dbt Cloud to Google BigQuery so that you can leverage the power of dbt to transform data in BigQuery. - -### Generate BigQuery credentials - - - -In order to let dbt connect to your warehouse, you'll need to generate a keyfile. This is analogous to using a database user name and password with most other data warehouses. - -1. Go to the [BigQuery credential wizard](https://console.cloud.google.com/apis/credentials/wizard). Make sure your new project is selected in the header. If you do not see your account or project, click your profile picture to the right and verify you are using the correct email account. -2. Select **+ Create Credentials** then select **Service account**. -3. Type `dbt-user` in the Service account name field, then click **Create and Continue**. -4. Type and select **BigQuery Admin** in the Role field. -5. Click **Continue**. -6. Leave fields blank in the "Grant users access to this service account" section and click **Done**. -7. Click the service account that you just created. -8. Select **Keys**. -9. Click **Add Key** then select **Create new key**. -10. Select **JSON** as the key type then click **Create**. -11. You should be prompted to download the file. Save it locally to an easy-to-remember spot, with a clear filename. For example, `dbt-user-creds.json`. 
- -### Create a dbt Cloud account - - - - - -### Connect dbt Cloud to BigQuery - -Now let's set up the connection between dbt Cloud and BigQuery. - -1. Click **BigQuery** to set up your connection. -2. Click **Upload a Service Account JSON File** in BigQuery settings. -3. Select the JSON file you downloaded in [Generate BigQuery Credentials](#generate-bigquery-credentials). dbt Cloud will fill in all the necessary fields. -4. Click **Test** at the top. This verifies that dbt Cloud can access your BigQuery account. -5. If you see "Connection test Succeeded!" then click **Continue**. If it fails, you might need to go back and regenerate your BigQuery credentials. - -## Initialize your repository and start development - -### Setting up a managed repository - - - -### Initialize your dbt project - - - - -Congratulations! You have successfully completed the following: - -- Set up a new BigQuery instance -- Accessing sample data in a public data set -- Connected dbt Cloud to BigQuery - -## Next steps - - diff --git a/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-databricks.md b/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-databricks.md deleted file mode 100644 index 3d43d312565..00000000000 --- a/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-databricks.md +++ /dev/null @@ -1,262 +0,0 @@ ---- -title: "Set up and connect Databricks" -id: setting-up-databricks -description: "Set up Databricks with sample data and connect to dbt Cloud." -sidebar_label: "Set up and connect Databricks" -pagination_prev: docs/get-started/getting-started/set-up-dbt-cloud -pagination_next: docs/get-started/getting-started/building-your-first-project/build-your-first-models ---- - -## Introduction - -For the Databricks project in the getting started guide, you'll learn how to set up Databricks and connect it to dbt Cloud. 
- -This project will walk you through: - -- Setting up a Databricks account -- Loading training data into your Databricks account -- Configuring a SQL endpoint in Databricks -- Connecting dbt Cloud and Databricks - -## Prerequisites - -Before starting this tutorial, you will need the following: - -- Existing Cloud Provider account (AWS, GCP, Azure). -- Permissions to create an S3 bucket in said account. - -## Setting up - -1. Use your existing account or sign up for a Databricks account at [Try Databricks](https://databricks.com/). Complete the form with your user information. - -
          - -
          - -2. For the purpose of this tutorial, we will be selecting AWS as our cloud provider, but if you use Azure or GCP internally, please choose one of them. The setup process will be similar. -3. Check your email to complete the verification process. -4. After setting up your password, you will be guided to choose a subscription plan. You will need to select either the `Premium` or `Enterprise` plan to access the SQL Compute functionality, required for using the SQL Endpoint for dbt. We have chosen `Premium` for this tutorial. Click `Continue` after selecting your plan. - -
          - -
          - -5. Click `Get Started` when you come to the page below, and then click `Confirm` after you validate that you have everything needed. - -
          - -
          -
          - -
          - -6. Now it's time to create your first workspace. A Databricks workspace is an environment for accessing all of your Databricks assets. The workspace organizes objects like notebooks, SQL Endpoints, clusters, etc. into one place. Provide the name of your workspace, choose the appropriate AWS region, and click **Start Quickstart**. You might see a checkbox labeled `I have data in S3 that I want to query with Databricks`. You do not need to select it for the purpose of this tutorial. - -
          - -
          - -7. By clicking on `Start Quickstart`, you will be redirected to AWS and asked to log in if you haven’t already. After logging in, you should see a page similar to this. - -
          - -
          - -:::tip -If you get a session error and don’t get redirected to this page, do not worry, go back to the Databricks UI and create a workspace from the interface. All you have to do is click **create workspaces**, choose the quickstart, fill out the form and click **Start Quickstart**. -::: - -8. There is no need to change any of the pre-filled out fields in the Parameters. Just add in your Databricks password under **Databricks Account Credentials**. Check off the Acknowledgement and click **Create stack**. -
          - -
          - -
          - -
          -9. Afterwards, you should land on the CloudFormation > Stacks page. Once the status becomes `CREATE_COMPLETE`, you will be ready to start. This process can take about 5 minutes, so feel free to click refresh to update the status. -
          - -
          -10. Go back to the Databricks tab. You should see that your workspace is ready to use. -
          - -
          -11. Now let’s jump into the workspace. Click on `Open` and log into the workspace using the same login as you used to log into the account. - -Congrats! You are now ready to start working in the workspace. -
          - -
          - -## Loading data - -Our next step is to load some data to transform. Luckily for us, Databricks makes it really easy for us to upload data. - -1. First we need a SQL endpoint. Find the drop down menu and toggle into the SQL space. -
          - -
          -2. We will be setting up a SQL endpoint now. Select `SQL Endpoints` from the left hand side console. You will see that a default SQL Endpoint exists. -
          - -
          -3. Click **Start** on the Starter Endpoint. This will take a few minutes to get the necessary resources spun up. - -4. While you're waiting, download the three CSV files locally that you will need for this tutorial. You can find them here: - - [jaffle_shop_customers.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_customers.csv) - - [jaffle_shop_orders.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_orders.csv) - - [stripe_payments.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/stripe_payments.csv) - -5. Once the SQL Endpoint is up, click on the `Create` and then `Table` on the drop down menu. -
          - -
          - -6. Let's load the Jaffle Shop Customers data first. Drop in the `jaffle_shop_customers.csv` file into the UI. -
          - -
          - -7. Update the Table Attributes at the top: - - - data_catalog = hive_metastore - - database = default - - table = jaffle_shop_customers - - Make sure that the column data types are correct. The way you can do this is by hovering over the datatype icon next to the column name. - - ID = bigint - - FIRST_NAME = string - - LAST_NAME = string - -
          - -
          - -8. Click `Create` on the bottom once you’re done. - -9. Now let’s do the same for `Jaffle Shop Orders` and `Stripe Payments`. - -
          - -
          - -
          - -
          - -10. Once that's done, make sure you can query the training data. Navigate to the `SQL Editor` through the left hand menu. This will bring you to a query editor. -11. Ensure that you can run a `select *` from each of the tables with the following code snippets. - - ```sql - select * from default.jaffle_shop_customers - select * from default.jaffle_shop_orders - select * from default.stripe_payments - ``` - -
          - -
          - -12. To ensure that any users who might be working on your dbt project have access to your objects, run this command. - - ```sql - grant all privileges on schema default to users; - ``` - -Congratulations! At this point, you have created a Databricks account, loaded training data, and successfully set up a SQL endpoint to query the data. - -## Connecting to dbt Cloud - - There are two ways to connect dbt Cloud and Databricks. The first option is Partner Connect, which provides a streamlined setup to create your dbt Cloud account from within your new Databricks trial account. The second option is to create your dbt Cloud account separately and build the Databricks connection yourself. If you are looking to get started quickly, we recommend option 1. If you are looking to customize your setup from the very beginning and gain familiarity with the dbt Cloud setup flow, we recommend option 2. - -### Option 1: Connect dbt Cloud and Databricks with partner connect - -1. In the Databricks workspace, on the left-side console: click on `Partner Connect`. - -
          - -
          - -2. Select the dbt tile under `Data preparation and transformation`. -3. Click on `Next` when prompted to `Connect to partner`. This action will create a service principal, a PAT token for that service principal, and a SQL Endpoint for the dbt Cloud account to use. This does mean that you will have two SQL Endpoints at your disposal: one from the previous step and one from using Partner Connect. - -
          - -
          - - -4. Click on `Connect to dbt Cloud`. -
          - -
          -5. After the new tab loads, you will see a form. If you already created a dbt Cloud account, you will be asked to provide an account name. If you haven't created an account, you will be asked to provide an account name and password. - -
          - -
          - -6. After you have filled out the form and clicked on `Complete Registration`, you will be logged into dbt Cloud automatically. - - -### Option 2: Connect dbt Cloud and Databricks manually - -#### Get endpoint and token information - -To manually setup dbt Cloud, you will need the SQL Endpoint connection information and to generate a user token. You can find your SQL endpoint connection information by going to the `Databricks UI > SQL > SQL Endpoints > Starter Endpoint > Connection details`. Save this information because you will need it later. - - - -To generate a user token for your development credentials in dbt Cloud, click on `Settings` on the left side console (while still in the SQL part of the workspace). Click on `Personal Access Token` and provide a comment like `dbt Cloud development`. Save the token information somewhere because you will need it for the next part. -
          - -
          - -#### Create a dbt Cloud account - - - -#### Connect dbt Cloud to Databricks - -1. Choose **Databricks** to setup your connection. - - -2. For the name, write `Databricks` or another simple title. -3. For Databricks settings, reference your SQL endpoint connection details from step 6 of the previous section for each of the following fields: - - - Method will be ODBC - - Hostname comes from Server hostname - - Endpoint comes from the last part of HTTP path after `/endpoints` -
          - -
          - -4. For your Development Credentials, type: - - - `User` and `token` that you saved in a previous step. - - You’ll notice that the schema name has been auto created for you. By convention, this is `dbt_`. This is the schema connected directly to your development environment, and it's where your models will be built when running dbt within the Cloud IDE. - -5. Click **Test Connection** at the bottom. This verifies that dbt Cloud can access your Databricks workspace. -6. If the connection test succeeds, click **Next**. If it fails, you may need to check your Databricks settings and credentials. - -## Initialize your repository and start development - -If you used Partner Connect, you can skip to [initializing your dbt project](/setting-up-databricks#initialize-your-dbt-project) as the Partner Connect provides you with a managed repository. Otherwise, you will need to create your managed repository connection. - -### Setting up a managed repository - - - -### Initialize your dbt project - - - -Congratulations! You have successfully completed the following: - -- Set up a Databricks account -- Loaded training data into your Databricks account -- Configured a SQL endpoint in Databricks -- Connected dbt Cloud and Databricks - -## Next steps - - diff --git a/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-redshift.md b/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-redshift.md deleted file mode 100644 index 85300aca8ce..00000000000 --- a/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-redshift.md +++ /dev/null @@ -1,215 +0,0 @@ ---- -title: "Set up and connect Redshift" -id: setting-up-redshift -description: "Set up Redshift with sample data and connect to dbt Cloud." 
-sidebar_label: "Set up and connect Redshift" -pagination_prev: docs/get-started/getting-started/set-up-dbt-cloud -pagination_next: docs/get-started/getting-started/building-your-first-project/build-your-first-models ---- - -## Introduction - -For the Redshift project in the getting started guide, you'll learn how to set up Redshift and connect it to dbt Cloud. - -This guide will walk you through: - -- Setting up a Redshift cluster -- Loading training data into your Redshift account -- Connecting dbt Cloud and Redshift - -## Prerequisites - -Before beginning this tutorial you will need access to an **existing AWS account** with permissions to execute a CloudFormation template to create appropriate roles and a Redshift cluster. If you do not have an AWS account, head over to [Sign up for AWS](https://portal.aws.amazon.com/billing/signup#/start/email). - -## Setting up - -Let’s get started by accessing your AWS account and setting up Redshift. - -1. Sign into your AWS account on the [AWS sign in page](https://signin.aws.amazon.com/console) as a root user or IAM user depending on your level of access. -2. We will be using a CloudFormation template to quickly set up a Redshift instance. A CloudFormation template is a configuration file that will automatically spin up the necessary resources in AWS. Use the link below to start this process. (source: [cloudformation json file](https://github.com/aws-samples/aws-modernization-with-dbtlabs/blob/main/resources/cloudformation/create-dbtworkshop-infr)) - -**[Start CloudFormation Stack](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?stackName=dbt-workshop&templateURL=https://tpch-sample-data.s3.amazonaws.com/create-dbtworkshop-infr)** - -3. Choose next for each page until you reach the `Select acknowledgement checkbox`. Check the box for "I acknowledge that AWS CloudFormation might create IAM resources with custom names" and click `Create Stack`. 
You should land on the stack page with a `CREATE_IN_PROGRESS` status. - - - -4. When the stack status changes to “CREATE_COMPLETE”, click on the `Outputs` tab on the top to view information that you will use throughout the rest of this guide. Save those credentials for later by keeping this open in a tab. - -5. Type in `Redshift` to the search bar on the top and click on `Amazon Redshift` - - - -6. Confirm that your new Redshift Cluster is listed under Cluster overview. Click on the cluster name. - - - -7. Click on `Query Data`. You can choose the classic query editor or v2. We will be using the v2 version for the purpose of this guide. - -8. You may be asked to Configure account. For the purpose of this sandbox environment, we recommend selecting “Configure account”. - -9. Click on your cluster name in the list and fill out the credentials from the output of the stack. -- Database: `dbtworkshop` -- User Name: `dbtadmin` -- Password: *choose your own password and save it for later* - - - - - -10. Click on `Create Connection`. - -Congrats! You have your Redshift cluster. - -## Loading data - -Now we are going to load our sample data into the S3 bucket that our Cloudformation template created. S3 buckets are a cheap and simple way to store data outside of Redshift. - -1. The data used in this course is stored as CSVs in a public S3 bucket. You can use the following URLs to download these files. Download these to your computer to use in the following steps. -- [jaffle_shop_customers.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_customers.csv) -- [jaffle_shop_orders.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_orders.csv) -- [stripe_payments.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/stripe_payments.csv) - -2. Now we are going to use the S3 bucket that you created via CloudFormation and upload the files. Go to the search bar at the top and type in `S3` and click on S3. 
There will be sample data in the file already, feel free to ignore it or use it for other modeling exploration. - - - - -3. Click on the `name of the bucket` S3 bucket. If you have multiple S3 buckets, this will be the bucket that was listed under “Workshopbucket” on the Outputs page. The bucket will be prefixed with `dbt-data-lake`. - - - -4. Click on `Upload`, drag the three files into the UI, and click on `Upload` on the button. - - - -5. Save the name of the S3 bucket. It should look like this: `s3://dbt-data-lake-xxxx`. You will need it for the next section. -6. Now let’s go back to the Redshift query editor. Search for Redshift in the search bar, choose your cluster, and select Query data. -7. In your query editor, execute this query below to create the schemas that we will be placing your raw data into. You can highlight the statement and then click on Run to run them individually. If you are on the Classic Query Editor, you might need to input them separately into the UI. You should see these schemas listed under `dbtworkshop`. - -```sql -create schema if not exists jaffle_shop; -create schema if not exists stripe; -``` - -8. Now create the tables in your schema with these queries using the statements below. These will be populated as tables in the respective schemas. - -```sql -create table jaffle_shop.customers( - id integer, - first_name varchar(50), - last_name varchar(50) -); - -create table jaffle_shop.orders( - id integer, - user_id integer, - order_date date, - status varchar(50), - _etl_loaded_at timestamp default current_timestamp -); - -create table stripe.payment( - id integer, - orderid integer, - paymentmethod varchar(50), - status varchar(50), - amount integer, - created date, - _batched_at timestamp default current_timestamp -); -``` - -9. Now we need to copy the data from S3. **Be sure to update the S3 location, iam role, and region.** You can find the S3 and iam role in your outputs from the Cloudformation stack. 
- -```sql -copy jaffle_shop.customers( id, first_name, last_name) -from 's3://dbt-data-lake-xxxx/jaffle_shop_customers.csv' -iam_role 'arn:aws:iam::XXXXXXXXXX:role/RoleName' -region 'us-east-1' -delimiter ',' -ignoreheader 1 -acceptinvchars; - -copy jaffle_shop.orders(id, user_id, order_date, status) -from 's3://dbt-data-lake-xxxx/jaffle_shop_orders.csv' -iam_role 'arn:aws:iam::XXXXXXXXXX:role/RoleName' -region 'us-east-1' -delimiter ',' -ignoreheader 1 -acceptinvchars; - -copy stripe.payment(id, orderid, paymentmethod, status, amount, created) -from 's3://dbt-data-lake-xxxx/stripe_payments.csv' -iam_role 'arn:aws:iam::XXXXXXXXXX:role/RoleName' -region 'us-east-1' -delimiter ',' -ignoreheader 1 -Acceptinvchars; -``` - -Ensure that you can run a select * from each of the tables with the following code snippets. - -```sql -select * from jaffle_shop.customers; -select * from jaffle_shop.orders; -select * from stripe.payment; -``` - -Congratulations! At this point, you have created a Redshift instance and loaded training data. In the next section, we will walk through the next steps to connect dbt Cloud and Redshift. - -## Connecting to dbt Cloud - -#### Create a dbt Cloud account - - - -#### Connect dbt Cloud to Redshift - -Now let's set up the connection between dbt Cloud and Redshift - -1. Click **Redshift** to set up your connection. -
          - -
          - -2. For the name, write `Redshift` or another simple title. - -3. Enter your Redshift settings. Reference your credentials you saved from the CloudFormation template. - - Your hostname is the entire hostname. Make sure to drop the http:// at the beginning and any trailing slashes at the end. - - The port is `5439` - - The database is `dbtworkshop`. -
          - -
          - -4. Set your development credentials. These credentials will be used by dbt Cloud to connect to Redshift. Those credentials (as provided in your cloudformation output) will be: - - Username: `dbtadmin` - - password: *this was the password that you set earlier in the guide* - - Schema: You’ll notice that the schema name has been auto created for you. By convention, this is `dbt_`. This is the schema connected directly to your development environment, and it's where your models will be built when running dbt within the Cloud IDE. -
          - -
          - -4. Click **Test Connection** at the bottom. This verifies that dbt Cloud can access your Redshift cluster. -5. If the connection test succeeds, click **Next**. If it fails, you may need to check your Redshift settings and credentials. - -## Initialize your repository and start development - -### Setting up a managed repository - - - -### Initialize your dbt project - - - -Congratulations! You have successfully completed the following: - -- Set up a Redshift cluster -- Loaded training data into your Redshift account -- Connected dbt Cloud and Redshift - -## Next steps - - diff --git a/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-snowflake.md b/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-snowflake.md deleted file mode 100644 index 86ad73d3d47..00000000000 --- a/website/docs/docs/get-started/getting-started/getting-set-up/setting-up-snowflake.md +++ /dev/null @@ -1,293 +0,0 @@ ---- -title: "Set up and connect Snowflake" -id: setting-up-snowflake -description: "Set up Snowflake with sample data and connect to dbt Cloud." -sidebar_label: "Set up and connect Snowflake" -pagination_prev: docs/get-started/getting-started/set-up-dbt-cloud -pagination_next: docs/get-started/getting-started/building-your-first-project/build-your-first-models ---- - -## Introduction - -For the Snowflake project in the getting started guide, you'll learn how to set up Snowflake and connect it to dbt Cloud. - -This guide will walk you through: - -* Setting up a Snowflake trial account -* Loading training data into your Snowflake account -* Creating a dbt Cloud account, either through Partner Connect or through the account flow -* Connecting dbt Cloud and Snowflake -* Setting up the dbt Cloud IDE, querying data, and doing your first dbt run - -## Prerequisites - -The only prerequisites for this guide are to have access to an email account for signing up for Snowflake and dbt Cloud. 
- -## Setting up - -You can start by signing up for a free trial on Snowflake: - -1. Sign up for a free trial by following [this link](https://signup.snowflake.com/) and completing the sign-up form. -2. Select the Enterprise edition, choose a cloud provider and region, and agree to the terms of service. - - You should consider organizational questions when choosing a cloud provider for a full implementation. For more information, see [Introduction to Cloud Platforms](https://docs.snowflake.com/en/user-guide/intro-cloud-platforms.html) in the Snowflake docs. For the purposes of this setup, all cloud providers and regions will work so choose whichever you’d like. -3. Click **GET STARTED**. -
          - -
          - -4. After submitting the sign-up form, you should receive an email asking you to activate your account. Click the link in the email and a new tab will open up where you’ll create your username and password. Complete the form and click **Get started**. -
          - -
          - -5. Congrats! Your workspace is ready for some data. Feel free to check out any of the getting started tooltips that Snowflake provides in the UI to familiarize yourself before moving on to the next section. -
          - -
          - -## Loading data - -Now we’re ready for some sample data. The data used here is stored as CSV files in a public S3 bucket and the following steps will guide you through how to prepare your Snowflake account for that data and upload it. - -1. If using the new Snowflake UI, create a new worksheet by clicking the "+ Worksheet" button in the upper right hand corner of the screen. - -

          - -

          - -2. Run the following commands to create a new virtual warehouse, two new databases (one for raw data, the other for future dbt development), and two new schemas (one for `jaffle_shop` data, the other for 'stripe' data). If you're curious to learn more about the naming conventions used, check out [this article](https://blog.getdbt.com/how-we-configure-snowflake/). Feel free to copy/paste from below: - -```sql -create warehouse transforming; -create database raw; -create database analytics; -create schema raw.jaffle_shop; -create schema raw.stripe; -``` - -

          - -

          - -3. Our next step will focus on creating **three** raw tables in the `raw` database and `jaffle_shop` and `stripe` schemas. Execute the tabbed code snippets below to create the customers, orders, and payment table and load the respective data. - - - - - - ```sql - ​​create table raw.jaffle_shop.customers - ( id integer, - first_name varchar, - last_name varchar - ); - - copy into raw.jaffle_shop.customers (id, first_name, last_name) - from 's3://dbt-tutorial-public/jaffle_shop_customers.csv' - file_format = ( - type = 'CSV' - field_delimiter = ',' - skip_header = 1 - ); - ``` - - - - - - ```sql - create table raw.jaffle_shop.orders - ( id integer, - user_id integer, - order_date date, - status varchar, - _etl_loaded_at timestamp default current_timestamp - ); - - copy into raw.jaffle_shop.orders (id, user_id, order_date, status) - from 's3://dbt-tutorial-public/jaffle_shop_orders.csv' - file_format = ( - type = 'CSV' - field_delimiter = ',' - skip_header = 1 - ); - ``` - - - - - - ```sql - create table raw.stripe.payment - ( id integer, - orderid integer, - paymentmethod varchar, - status varchar, - amount integer, - created date, - _batched_at timestamp default current_timestamp - ); - - copy into raw.stripe.payment (id, orderid, paymentmethod, status, amount, created) - from 's3://dbt-tutorial-public/stripe_payments.csv' - file_format = ( - type = 'CSV' - field_delimiter = ',' - skip_header = 1 - ); - ``` - - - - -

          - -

          - -

          - -

          - -

          - -

          - -6. Great! Your data is loaded and ready to go. Just to make sure, run the following commands to query your data and confirm that you see an output for each one. - -```sql -select * from raw.jaffle_shop.customers; -select * from raw.jaffle_shop.orders; -select * from raw.stripe.payment; -``` - -Now we’re ready to set up dbt Cloud! - -## Connecting to dbt Cloud - -There are two ways to connect dbt Cloud and Snowflake. The first option is Partner Connect, which provides a streamlined setup to create your dbt Cloud account from within your new Snowflake trial account. The second option is to create your dbt Cloud account separately and build the Snowflake connection yourself. If you are looking to get started quickly, we recommend **option 1**. If you are looking to customize your setup from the very beginning and gain familiarity with the dbt Cloud setup flow, we recommend **option 2**. - -### Option 1: Connect dbt Cloud and Snowflake with partner connect - -1. With your Snowflake account up and running with data, we’re ready to connect it with dbt Cloud. We’re going to use [Snowflake Partner Connect](https://docs.snowflake.com/en/user-guide/ecosystem-partner-connect.html) to set up your dbt Cloud account and project. Using Partner Connect will allow you to create a complete dbt account with your [Snowflake connection](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database#connecting-to-snowflake), [a managed repository](/docs/collaborate/git/managed-repository), [environments](/docs/build/custom-schemas#managing-environments), and credentials. -2. There’s a couple of ways you can access the Partner Connect page depending on if you’re navigating in the classic Snowflake UI or the new UI. - - * **Snowflake Classic UI:** If you’re using the classic version of the Snowflake UI, you can click the Partner Connect button in the top bar of your account. From there, click on the dbt tile to open up the connect box. - -

          - -

          - - * **Snowflake New UI:** If you’re using the new web interface, you’ll want to click on your name in the upper left hand corner and then click on Partner Connect in the drop down menu. You can scroll down to find the dbt tile, or search for dbt in the search bar and it will float to the top. Click on the tile to open up the connect box. - -

          - -

          - -

          - -

          - -3. Once you’ve clicked on the tile, a connection box will appear that will look slightly different depending on the route you took above, but will contain the same Optional Grant box towards the bottom. In both cases, you’ll want to type in or select the `RAW` and `ANALYTICS` databases. This will grant access for your new dbt user role to each database. - -

          - -

          - -

          - -

          - -4. After you’ve entered the database names using either option above, click "Connect". You should see a pop-up window similar to one of the options below. Click Activate. - -

          - -

          - -

          - -

          - -5. A new tab will be created that will take you to the dbt Cloud website. Here you’ll be asked to create an account name with password, as well as agree to the Terms of Service. Once that’s done, click Complete Registration. - -

          - -

          - -6. We have one slight tweak to make to the dbt Cloud interface to account for the `analytics` database and `transforming` warehouse created earlier. Click the gear icon in the upper right and select **Account Settings**. Choose the "Partner Connection Trial" project and select `snowflake` in the overview table. Select edit and update the fields `database` and `warehouse` to be `analytics` and `transforming`, respectively. - -

          - -

          - -

          - -

          - -7. Great! Your dbt Cloud account is now completely set up and connected to your Snowflake trial account with a [managed repository](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-using-a-managed-repository). You can skip to the **[Initialize your repo and start development](#initialize-your-repository-and-start-development)** section to get started in the IDE. - -### Option 2: Connect dbt Cloud and Snowflake manually - -#### Create a dbt Cloud account - - - -#### Connect dbt Cloud to Snowflake - -Now let's formally set up the connection between dbt Cloud and Snowflake. - -1. Choose **Snowflake** to set up your connection. -
          - -
          -2. For the name, write `Snowflake` or another simple title. -2. Enter the following information under Snowflake settings. - * **Account:** Find your account by using the Snowflake trial account URL and removing `snowflakecomputing.com`. The order of your account information will vary by Snowflake version. For example, Snowflake's Classic console URL might look like: `oq65696.west-us-2.azure.snowflakecomputing.com`. The AppUI or Snowsight URL might look more like: `snowflakecomputing.com/west-us-2.azure/oq65696`. In both examples, your account will be: `oq65696.west-us-2.azure`. For more information, see "[Account Identifiers](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html)" in the Snowflake documentation. -
          - * **Role:** Leave blank for now. You can update this to a default Snowflake role in the future. - * **Database:** `analytics`. This tells dbt to create new models in the analytics database. - * **Warehouse:** `transforming`. This tells dbt to use the transforming warehouse we created earlier. -
          - -
          - -3. Enter the following information under Development credentials. - * **Username:** The username you created for Snowflake. Note: The username is not your email address and is usually your first and last name together in one word. - * **Password:** The password you set when creating your Snowflake account - * **Schema:** You’ll notice that the schema name has been auto created for you. By convention, this is `dbt_`. This is the schema connected directly to your development environment, and it's where your models will be built when running dbt within the Cloud IDE. - * **Target name:** leave as default - * **Threads:** Leave as 4. This is the number of simultaneous connects that dbt Cloud will make to build models concurrently. -
          - -
          - -4. Click **Test Connection** at the bottom. This verifies that dbt Cloud can access your Snowflake account. -5. If the connection test succeeds, click **Next**. If it fails, you may need to check your Snowflake settings and credentials. - -## Initialize your repository and start development - -If you used Partner Connect, you can skip over to [initializing your dbt project](/setting-up-snowflake#initialize-your-dbt-project) as Partner Connect already sets you up with a managed repository. If not, you will need to create your managed repository connection. - -### Setting up a managed repository - - - -### Initialize your dbt project - - - -Congratulations! You have successfully completed the following: - -- Set up a new Snowflake instance -- Loaded training data into your Snowflake account -- Connected dbt Cloud and Snowflake - -## Next steps - - diff --git a/website/docs/docs/get-started/getting-started/overview.md b/website/docs/docs/get-started/getting-started/overview.md deleted file mode 100644 index 635b80a0155..00000000000 --- a/website/docs/docs/get-started/getting-started/overview.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -title: About getting started -id: overview -sidebar_label: "About getting started" -description: "Create your first dbt project using a SQL query." ---- -
          - - - -Before you begin, you will need: - -* Warehouse with sample data. If you don't have this, you can use the [BigQuery project](/docs/get-started/getting-started/getting-set-up/setting-up-bigquery), which leverages public data sets. -* Basic understanding of Git. -* Basic understanding of SQL. - - -
          - - - - - -
          - -
          diff --git a/website/docs/docs/get-started/installation-overview.md b/website/docs/docs/get-started/installation-overview.md deleted file mode 100644 index cfb03a6b2cc..00000000000 --- a/website/docs/docs/get-started/installation-overview.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: "Installation overview" -id: "installation" -description: "You can install dbt Core using a few different tested methods." ---- - -You can install dbt Core on the command line by using one of these recommended methods: - -- [Use Homebrew to install dbt](/docs/get-started/homebrew-install) (recommended for MacOS + most popular plugins) -- [Use pip to install dbt](/docs/get-started/pip-install) -- [Use a Docker image to install dbt](/docs/get-started/docker-install) -- [Install dbt from source](/docs/get-started/source-install) - - -## About dbt adapters - -dbt works with a number of different data platforms (databases, query engines, and other SQL-speaking technologies). It does this by using a dedicated _adapter_ for each. When you install dbt, you'll also want to install the specific adapter for your database. For more details, see [Supported Data Platforms](supported-data-platforms). diff --git a/website/docs/docs/get-started/pip-install.md b/website/docs/docs/get-started/pip-install.md deleted file mode 100644 index 5075a3e5086..00000000000 --- a/website/docs/docs/get-started/pip-install.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -title: "Install with pip" -description: "You can use pip to install dbt Core and adapter plugins from the command line." ---- - -You need to use `pip` to install dbt Core on Windows or Linux operating systems. You should use [Homebrew](/docs/get-started/homebrew-install) for installing dbt Core on a MacOS. - -You can install dbt Core and plugins using `pip` because they are Python modules distributed on [PyPI](https://pypi.org/project/dbt/). We recommend using virtual environments when installing with `pip`. 
- - - - - -Once you know [which adapter](supported-data-platforms) you're using, you can install it as `dbt-`. For example, if using Postgres: - -```shell -pip install dbt-postgres -``` - -This will install `dbt-core` and `dbt-postgres` _only_: - -```shell -$ dbt --version -installed version: 1.0.0 - latest version: 1.0.0 - -Up to date! - -Plugins: - - postgres: 1.0.0 -``` - -All adapters build on top of `dbt-core`. Some also depend on other adapters: for example, `dbt-redshift` builds on top of `dbt-postgres`. In that case, you would see those adapters included by your specific installation, too. - -### Upgrading - -To upgrade a specific adapter plugin: - -```shell -pip install --upgrade dbt- -``` - -### Install dbt-core only - -If you're building a tool that integrates with dbt Core, you may want to install the core library alone, without a database adapter. Note that you won't be able to use dbt as a CLI tool. - -```shell -pip install dbt-core -``` - -### `pip install dbt` - -Note that, as of v1.0.0, `pip install dbt` is no longer supported and will raise an explicit error. Since v0.13, the PyPI package named `dbt` was a simple "pass-through" of `dbt-core` and the four original database adapter plugins. For v1, we formalized that split. - -If you have workflows or integrations that relied on installing the package named `dbt`, you can achieve the same behavior going forward by installing the same five packages that it used: - -```shell -pip install \ - dbt-core \ - dbt-postgres \ - dbt-redshift \ - dbt-snowflake \ - dbt-bigquery -``` - -Or, better yet, just install the package(s) you need! 
diff --git a/website/docs/docs/get-started/run-your-dbt-projects.md b/website/docs/docs/get-started/run-your-dbt-projects.md deleted file mode 100644 index ca7dd0b1757..00000000000 --- a/website/docs/docs/get-started/run-your-dbt-projects.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: "Run your dbt projects" -id: "run-your-dbt-projects" ---- -You can run your dbt projects with [dbt Cloud](/docs/get-started/dbt-cloud-features) and [dbt Core](https://github.com/dbt-labs/dbt-core). dbt Cloud is a hosted application where you can develop directly from a web browser. dbt Core is an open source project where you can develop from the command line. - -Among other features, dbt Cloud provides a development environment to help you build, test, run, and [version control](/docs/collaborate/git-version-control) your project faster. It also includes an easier way to share your [dbt project's documentation](/docs/collaborate/build-and-view-your-docs) with your team. These development tasks are directly built into dbt Cloud for an _integrated development environment_ (IDE). Refer to [Develop in the Cloud](/docs/get-started/develop-in-the-cloud) for more details. - -With dbt Core, you can run your dbt projects from the command line. The command line interface (CLI) is available from your computer's terminal application such as Terminal and iTerm. When using the command line, you can run commands and do other work from the current working directory on your computer. Before running the dbt project from the command line, make sure you are working in your dbt project directory. Learning terminal commands such as `cd` (change directory), `ls` (list directory contents), and `pwd` (present working directory) can help you navigate the directory structure on your system. 
- -When running your project from dbt Core or dbt Cloud, the commands you commonly use are: - -- [dbt run](/reference/commands/run) — Runs the models you defined in your project -- [dbt build](/reference/commands/build) — Builds and tests your selected resources such as models, seeds, snapshots, and tests -- [dbt test](/reference/commands/test) — Executes the tests you defined for your project - -For information on all dbt commands and their arguments (flags), see the [dbt command reference](/reference/dbt-commands). If you want to list all dbt commands from the command line, run `dbt --help`. To list a dbt command’s specific arguments, run `dbt COMMAND_NAME --help` . - -## Related docs - -- [How we set up our computers for working on dbt projects](https://discourse.getdbt.com/t/how-we-set-up-our-computers-for-working-on-dbt-projects/243) -- [Model selection syntax](/reference/node-selection/syntax) -- [Cloud IDE features](/docs/get-started/dbt-cloud-features#ide-features) -- [Does dbt offer extract and load functionality?](/faqs/Project/transformation-tool) diff --git a/website/docs/docs/introduction.md b/website/docs/docs/introduction.md index c54eca1918e..0aeef0201cb 100644 --- a/website/docs/docs/introduction.md +++ b/website/docs/docs/introduction.md @@ -1,11 +1,11 @@ --- title: "What is dbt?" id: "introduction" +pagination_next: null +pagination_prev: null --- -## About dbt - - + dbt compiles and runs your analytics code against your data platform, enabling you and your team to collaborate on a single source of truth for metrics, insights, and business definitions. This single source of truth, combined with the ability to define tests for your data, reduces errors when logic changes, and alerts you when issues arise. @@ -30,6 +30,7 @@ Read more about why we want to enable analysts to work more like software engine You can access dbt using dbt Core or dbt Cloud. 
dbt Cloud is built around dbt Core, but it also provides: - Web-based UI so it’s more accessible +- dbt Cloud-powered command line (CLI) to develop, test, version control dbt projects, and run dbt commands - Hosted environment so it’s faster to get up and running - Differentiated features, such as metadata, in-app job scheduler, observability, integrations with other tools, integrated development environment (IDE), and more. @@ -37,11 +38,12 @@ You can learn about plans and pricing on [www.getdbt.com](https://www.getdbt.com ### dbt Cloud -dbt Cloud is the fastest and most reliable way to deploy dbt. Develop, test, schedule, and investigate data models all in one web-based UI. Read more about [Getting started with dbt Cloud](/docs/get-started/getting-started/set-up-dbt-cloud) and [dbt Cloud features](/docs/get-started/dbt-cloud-features). +dbt Cloud is the fastest and most reliable way to deploy dbt. Develop, test, schedule, and investigate data models all in one web-based UI. It also natively supports developing using a command line with the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). +Learn more about [dbt Cloud features](/docs/cloud/about-cloud/dbt-cloud-features) and try one of the [dbt Cloud quickstarts](/quickstarts). ### dbt Core -dbt Core is an open-source tool that enables data teams to transform data using analytics engineering best practices. You can install and use dbt Core on the command line. Read more about [Getting started with dbt Core](/docs/get-started/getting-started-dbt-core). +dbt Core is an open-source tool that enables data teams to transform data using analytics engineering best practices. You can install and use dbt Core on the command line. Learn more with the [quickstart for dbt Core](/quickstarts/codespace). ## The power of dbt @@ -60,8 +62,7 @@ As a dbt user, your main focus will be on writing models (i.e. 
select queries) t ### Related docs -- [Getting started with dbt Cloud](/docs/get-started/getting-started/set-up-dbt-cloud) -- [Getting started with dbt Core](/docs/get-started/getting-started-dbt-core) +- [Quickstarts for dbt](/quickstarts) - [Best practice guides](/guides/best-practices) - [What is a dbt Project?](/docs/build/projects) -- [dbt run](/docs/get-started/run-your-dbt-projects) +- [dbt run](/docs/running-a-dbt-project/run-your-dbt-projects) diff --git a/website/docs/docs/running-a-dbt-project/run-your-dbt-projects.md b/website/docs/docs/running-a-dbt-project/run-your-dbt-projects.md new file mode 100644 index 00000000000..b3b6ffb3e45 --- /dev/null +++ b/website/docs/docs/running-a-dbt-project/run-your-dbt-projects.md @@ -0,0 +1,37 @@ +--- +title: "Run your dbt projects" +id: "run-your-dbt-projects" +pagination_prev: null +--- +You can run your dbt projects with [dbt Cloud](/docs/cloud/about-cloud/dbt-cloud-features) or [dbt Core](https://github.com/dbt-labs/dbt-core): + +- **dbt Cloud**: A hosted application where you can develop directly from a web browser using the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud). It also natively supports developing using a command line interface, [dbt Cloud CLI](/docs/cloud/cloud-cli-installation). Among other features, dbt Cloud provides: + + - Development environment to help you build, test, run, and [version control](/docs/collaborate/git-version-control) your project faster. + - Share your [dbt project's documentation](/docs/collaborate/build-and-view-your-docs) with your team. + - Integrates with the dbt Cloud IDE, allowing you to run development tasks and environment in the dbt Cloud UI for a seamless experience. + - The dbt Cloud CLI to develop and run dbt commands against your dbt Cloud development environment from your local command line. + - For more details, refer to [Develop in the Cloud](/docs/cloud/about-cloud-develop). 
+ +- **dbt Core**: An open source project where you can develop from the [command line](/docs/core/about-dbt-core). + +The dbt Cloud CLI and dbt Core are both command line tools that enable you to run dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features). + +The command line is available from your computer's terminal application such as Terminal and iTerm. With the command line, you can run commands and do other work from the current working directory on your computer. Before running the dbt project from the command line, make sure you are working in your dbt project directory. Learning terminal commands such as `cd` (change directory), `ls` (list directory contents), and `pwd` (present working directory) can help you navigate the directory structure on your system. + +In dbt Cloud or dbt Core, the commands you commonly use are: + +- [dbt run](/reference/commands/run) — Runs the models you defined in your project +- [dbt build](/reference/commands/build) — Builds and tests your selected resources such as models, seeds, snapshots, and tests +- [dbt test](/reference/commands/test) — Executes the tests you defined for your project + +For information on all dbt commands and their arguments (flags), see the [dbt command reference](/reference/dbt-commands). If you want to list all dbt commands from the command line, run `dbt --help`. To list a dbt command’s specific arguments, run `dbt COMMAND_NAME --help` . 
+ +## Related docs + +- [How we set up our computers for working on dbt projects](https://discourse.getdbt.com/t/how-we-set-up-our-computers-for-working-on-dbt-projects/243) +- [Model selection syntax](/reference/node-selection/syntax) +- [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) +- [Cloud IDE features](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud#ide-features) +- [Does dbt offer extract and load functionality?](/faqs/Project/transformation-tool) +- [Why does dbt compile need a data platform connection](/faqs/Warehouse/db-connection-dbt-compile) diff --git a/website/docs/docs/running-a-dbt-project/using-the-dbt-ide.md b/website/docs/docs/running-a-dbt-project/using-the-dbt-ide.md index 61e93ab031f..f41bceab12d 100644 --- a/website/docs/docs/running-a-dbt-project/using-the-dbt-ide.md +++ b/website/docs/docs/running-a-dbt-project/using-the-dbt-ide.md @@ -8,8 +8,8 @@ The dbt Integrated Development Environment (IDE) is a single interface for build ### Requirements - The dbt IDE is powered by [dbt-rpc](/reference/commands/rpc) which has been overhauled in dbt v0.15.0. In order to use the IDE, your dbt project must be compatible with dbt v0.15.0. -- To use the IDE, you must have a [Developer License](/docs/collaborate/manage-access/seats-and-users). -- Write access must be enabled for your dbt repository in dbt Cloud. See [Connecting your GitHub Account](/docs/collaborate/git/connect-github) and [Importing a project by git URL](/docs/collaborate/git/import-a-project-by-git-url) for detailed setup instructions. +- To use the IDE, you must have a [Developer License](/docs/cloud/manage-access/seats-and-users). +- Write access must be enabled for your dbt repository in dbt Cloud. See [Connecting your GitHub Account](/docs/cloud/git/connect-github) and [Importing a project by git URL](/docs/cloud/git/import-a-project-by-git-url) for detailed setup instructions. 
### Creating a development environment New dbt Cloud accounts will automatically be created with a Development Environment for the project created during setup. If you have an existing dbt Cloud account, you may need to create a Development Environment manually to use the dbt IDE. @@ -30,7 +30,10 @@ New dbt Cloud accounts should have developer credentials created automatically a -New users on existing accounts *might not* have their development credentials already configured. To manage your development credentials, go to the [Credentials](https://cloud.getdbt.com/next/settings/profile#credentials) section. Select the relevant project in the list. After entering your developer credentials, you'll be able to access the dbt IDE. +New users on existing accounts *might not* have their development credentials already configured. +To manage your development credentials: +1. Navigate to your **Credentials** under **Your Profile** settings, which you can access at `https://YOUR_ACCESS_URL/settings/profile#credentials`, replacing `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. +2. Select the relevant project in the list. After entering your developer credentials, you'll be able to access the dbt IDE. @@ -43,7 +46,7 @@ This video explores entering the IDE and compiling and running SQL for a dbt pro ### Running dbt projects -This video explores running dbt projects in the dbt IDE. For a full list of the commands that can be run in the IDE, consult the [dbt Command reference](dbt-commands). +This video explores running dbt projects in the dbt IDE. For a full list of the commands that can be run in the IDE, consult the [dbt Command reference](/reference/dbt-commands). 
diff --git a/website/docs/docs/running-a-dbt-project/using-threads.md b/website/docs/docs/running-a-dbt-project/using-threads.md new file mode 100644 index 00000000000..5eede7abc27 --- /dev/null +++ b/website/docs/docs/running-a-dbt-project/using-threads.md @@ -0,0 +1,26 @@ +--- +title: "Using threads" +id: "using-threads" +sidebar_label: "Use threads" +description: "Understand what threads mean and how to use them." +pagination_next: null +--- + +When dbt runs, it creates a directed acyclic graph (DAG) of links between models. The number of threads represents the maximum number of paths through the graph dbt may work on at once – increasing the number of threads can minimize the run time of your project. + +For example, if you specify `threads: 1`, dbt will start building only one model, and finish it, before moving onto the next. Specifying `threads: 8` means that dbt will work on _up to_ 8 models at once without violating dependencies – the actual number of models it can work on will likely be constrained by the available paths through the dependency graph. + +There's no set limit of the maximum number of threads you can set – while increasing the number of threads generally decreases execution time, there are a number of things to consider: +* Increasing the number of threads increases the load on your warehouse, which may impact other tools in your data stack. For example, if your BI tool uses the same compute resources as dbt, their queries may get queued during a dbt run. +* The number of concurrent queries your database will allow you to run may be a limiting factor in how many models can be actively built – some models may queue while waiting for an available query slot. + +Generally the optimal number of threads depends on your data warehouse and its configuration. It’s best to test different values to find the best number of threads for your project. We recommend setting this to 4 to start with. 
+ +You can use a different number of threads than the value defined in your target by using the `--threads` option when executing a dbt command. + +You will define the number of threads in your `profiles.yml` file (for dbt Core users only), dbt Cloud job definition, and dbt Cloud development credentials under your profile. + + +## Related docs +- [About profiles.yml](https://docs.getdbt.com/reference/profiles.yml) +- [dbt Cloud job scheduler](/docs/deploy/job-scheduler) diff --git a/website/docs/docs/supported-data-platforms.md b/website/docs/docs/supported-data-platforms.md index c7057eb4766..a8e146f49d0 100644 --- a/website/docs/docs/supported-data-platforms.md +++ b/website/docs/docs/supported-data-platforms.md @@ -1,86 +1,46 @@ --- title: "Supported data platforms" id: "supported-data-platforms" +sidebar_label: "Supported data platforms" +description: "Connect dbt to any data platform in dbt Cloud or dbt Core, using a dedicated adapter plugin" +hide_table_of_contents: true +pagination_next: "docs/connect-adapters" +pagination_prev: null --- -dbt connects to and runs SQL against your database, warehouse, lake, or query engine. We group all of these SQL-speaking things into one bucket called _data platforms_. dbt can be extended to any data platform using a dedicated _adapter plugin_. These plugins are built as Python modules that dbt Core discovers if they are installed on your system. All the adapters listed below are open source and free to use, just like dbt Core. +dbt connects to and runs SQL against your database, warehouse, lake, or query engine. These SQL-speaking platforms are collectively referred to as _data platforms_. dbt connects with data platforms by using a dedicated adapter plugin for each. Plugins are built as Python modules that dbt Core discovers if they are installed on your system. Read [What are Adapters](/guides/dbt-ecosystem/adapter-development/1-what-are-adapters) for more info. 
-To learn more about adapters, check out [What Are Adapters](/guides/advanced/adapter-development/1-what-are-adapters). +You can [connect](/docs/connect-adapters) to adapters and data platforms natively in dbt Cloud or install them manually using dbt Core. -## Supported Data Platforms +You can also further customize how dbt works with your specific data platform via configuration: see [Configuring Postgres](/reference/resource-configs/postgres-configs) for an example. -### Verified Adapters +import MSCallout from '/snippets/_microsoft-adapters-soon.md'; -| Data Platform (click to view setup guide) | latest verified version | -| ----------------------------------------- | ------------------------ | -| [AlloyDB](alloydb-setup) | (same as `dbt-postgres`) | -| [Azure Synapse](azuresynapse-setup) | 1.3.0 :construction: | -| [BigQuery](bigquery-setup) | 1.2.0 | -| [Databricks](databricks-setup) | 1.2.0 :construction: | -| [Dremio](dremio-setup) | 1.3.0 :construction: | -| [Postgres](postgres-setup) | 1.2.0 | -| [Redshift](redshift-setup) | 1.2.0 | -| [Snowflake](snowflake-setup) | 1.2.0 | -| [Spark](spark-setup) | 1.2.0 | -| [Starburst & Trino](trino-setup) | 1.2.0 :construction: | -:construction:: Verification in progress + -### Community Adapters +## Types of Adapters -| Data Platforms (click to view setup guide) | | | -| ----------------------------------------------- | --------------------------------| ---------------------------------| -| [Athena](athena-setup) | [Hive](hive-setup) | [SingleStore](singlestore-setup) | -| [Clickhouse](clickhouse-setup) | [Impala](impala-setup) | [SQLite](sqlite-setup) | -| [IBM DB2](ibmdb2-setup) | [iomete](iomete-setup) | [SQL Server & Azure SQ](mssql-setup) | -| [DuckDB](duckdb-setup) | [Layer](layer-setup) | [AzureSynapse](azuresynapse-setup) | -| [Dremio](dremio-setup) | [Materialize](materialize-setup) | [Teradata](teradata-setup)| -| [Exasol Analytics](exasol-setup) | [MindsDB](mindsdb-setup) | [TiDB](tidb-setup)| -| 
[Firebolt](firebolt-setup) | [MySQL](mysql-setup) | [Vertica](vertica-setup)| -| [AWS Glue](glue-setup) | [Oracle](oracle-setup) | -| [Greenplum](greenplum-setup) | [Rockset](rockset-setup) +There are three types of adapters available today: +- **Verified** — [Verified adapters](verified-adapters) are those that have completed a rigorous verification process in collaboration with dbt Labs. +- **Trusted** — [Trusted adapters](trusted-adapters) are those where the adapter maintainers have agreed to meet a higher standard of quality. +- **Community** — [Community adapters](community-adapters) are open-source and maintained by community members. -## Adapter Installation +### Verified adapters -With a few exceptions [^1], all adapters listed below can be installed from PyPI using `pip install `. The installation will include `dbt-core` and any other required dependencies, which may include both other dependencies and even other adapter plugins. Read more about [installing dbt](/docs/get-started/installation). +The following are **Verified adapters** ✓ you can connect to either in dbt Cloud or dbt Core: -## Adapter Taxonomy +import AdaptersVerified from '/snippets/_adapters-verified.md'; -### Verified by dbt Labs + -In order to provide a more consistent and reliable experience, dbt Labs has a rigorous process by which we verify adapter plugins. The process covers aspects of development, documentation, user experience, and maintenance. These adapters earn a **Verified** designation so that users can have a certain level of trust and expectation when they use them. To learn more, see [Verifying a new adapter](/guides/advanced/adapter-development/7-verifying-a-new-adapter). +### Trusted adapters -We also welcome and encourage adapter plugins from the dbt community (see the below [Contributing to a pre-existing adapter](#contributing-to-a-pre-existing-adapter)). 
Please be mindful that these community maintainers are intrepid volunteers who donate their time and effort — so be kind, understanding, and help out where you can! +The following are **Trusted adapters** ✓ you can connect to in dbt Core: -### Maintainers +import AdaptersTrusted from '/snippets/_adapters-trusted.md'; -Who made and maintains an adapter is certainly relevant, but we recommend using an adapter's verification status to determine the quality and health of an adapter. So far there are three categories of maintainers: + -| Supported by | Maintained By | -| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| dbt Labs | dbt Labs maintains a set of adapter plugins for some of the most common databases, warehouses, and platforms. As for why particular data platforms were chosen, see ["Why Verify an Adapter"](7-verifying-a-new-adapter#why-verify-an-adapter) | -| Partner | These adapter plugins are built and maintained by the same people who build and maintain the complementary data technology. | -| Community | These adapter plugins are contributed and maintained by members of the community. 🌱 | +
          * Install these adapters using dbt Core as they're not currently supported in dbt Cloud.
          - -## Contributing to dbt-core adapters - -### Contributing to a pre-existing adapter - -Community-supported plugins are works in progress, and anyone is welcome to contribute by testing and writing code. If you're interested in contributing: - -- Join both the dedicated channel, [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM), in [dbt Slack](https://community.getdbt.com/) and the channel for your adapter's data store (see **Slack Channel** column of above tables) -- Check out the open issues in the plugin's source repository (follow relevant link in **Adapter Repository** column of above tables) - -### Creating a new adapter - -If you see something missing from the lists above, and you're interested in developing an integration, read more about adapters and how they're developed in the [Adapter Development](/guides/advanced/adapter-development/1-what-are-adapters) section. - -If you have a new adapter, please add it to this list using a pull request! See [Documenting your adapter](5-documenting-a-new-adapter) for more information. - -[^1]: Here are the two different adapters. Use the PyPI package name when installing with `pip` - - | Adapter repo name | PyPI package name | - | ----------------- | -------------------- | - | `dbt-athena` | `dbt-athena-adapter` | - | `dbt-layer` | `dbt-layer-bigquery` | diff --git a/website/docs/docs/trusted-adapters.md b/website/docs/docs/trusted-adapters.md new file mode 100644 index 00000000000..08191e8ea42 --- /dev/null +++ b/website/docs/docs/trusted-adapters.md @@ -0,0 +1,41 @@ +--- +title: "Trusted adapters" +id: "trusted-adapters" +hide_table_of_contents: true +--- + +Trusted adapters are adapters not maintained by dbt Labs, that we feel comfortable recommending to users for use in production. + +Free and open-source tools for the data professional are increasingly abundant. 
This is by-and-large a *good thing*; however, it requires due diligence that wasn't required in a paid-license, closed-source software world. As a user, there are important questions to answer before taking a dependency on an open-source project. The trusted adapter designation is meant to streamline this process for end users. + +
          Considerations for depending on an open-source project + +1. Does it work? +2. Does anyone "own" the code, or is anyone liable for ensuring it works? +3. Do bugs get fixed quickly? +4. Does it stay up-to-date with new Core features? +5. Is the usage substantial enough to self-sustain? +6. What risks do I take on by taking a dependency on this library? + +
          + +### Trusted adapter specifications + +See [Building a Trusted Adapter](/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter) for more information, particularly if you are an adapter maintainer considering having your adapter be added to the trusted list. + +### Trusted vs Verified + +The Verification program exists to highlight adapters that meet both of the following criteria: + +- the guidelines given in the Trusted program, +- formal agreements required for integration with dbt Cloud + +For more information on the Verified Adapter program, reach out to the [dbt Labs partnerships team](mailto:partnerships@dbtlabs.com). + +### Trusted adapters + +The following are **Trusted adapters** ✓ you can connect to in dbt Core: + +import AdaptersTrusted from '/snippets/_adapters-trusted.md'; + + diff --git a/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md b/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md index 9abd608300e..a398ace164e 100644 --- a/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md +++ b/website/docs/docs/use-dbt-semantic-layer/avail-sl-integrations.md @@ -1,28 +1,64 @@ --- title: "Available integrations" id: avail-sl-integrations -description: "Review a wide range of partners you can integrate and query with the dbt Semantic Layer." +description: "Discover the diverse range of partners that seamlessly integrate with the powerful dbt Semantic Layer, allowing you to query and unlock valuable insights from your data ecosystem." +tags: [Semantic Layer] sidebar_label: "Available integrations" +hide_table_of_contents: true +meta: + api_name: dbt Semantic Layer APIs --- -# Available integrations + -A wide variety of data applications across the modern data stack natively integrate with the dbt Semantic Layer and dbt metrics — from Business Intelligence tools to notebooks, data catalogs, and more.
-There are two categories of partner integrations: +There are a number of data applications that seamlessly integrate with the dbt Semantic Layer, powered by MetricFlow, from business intelligence tools to notebooks, spreadsheets, data catalogs, and more. These integrations allow you to query and unlock valuable insights from your data ecosystem. + +Use the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) to simplify metric queries, optimize your development workflow, and reduce coding. This approach also ensures data governance and consistency for data consumers. + +import AvailIntegrations from '/snippets/_sl-partner-links.md'; + + + +## Other integrations + +You can also integrate the following tools with the dbt Semantic Layer: +- [Push.ai](https://docs.push.ai/semantic-layer-integrations/dbt-semantic-layer) +- [Delphi](https://delphihq.com) +- KlipFolio Power Metrics - Documentation coming soon + +### Custom integration + +- You can create custom integrations using different languages and tools. We support connecting with JDBC, ADBC, and GraphQL APIs. For more info, check out [our examples on GitHub](https://github.com/dbt-labs/example-semantic-layer-clients/). +- You can also connect to tools that allow you to write SQL. These tools must meet one of the two criteria: + + - Supports a generic JDBC driver option (such as DataGrip) or + - Uses Arrow Flight SQL JDBC driver version 12.0.0 or higher. -- **dbt Semantic Layer partners** - Semantic Layer integrations are capable of interactively querying dbt metrics, surfacing the underlying data in partner tools. This refers to a group of dbt Labs partners who have full end-to-end integrations with the dbt Semantic Layer and leverage the dbt Server. +## Related docs + +- {frontMatter.meta.api_name} to learn how to integrate and query your metrics in downstream tools.
+- [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) +- [Hex dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex. + + + + -- **Metrics Ready partners** - Metrics Ready integrations facilitate building, discovery, and collaborating on dbt metric definitions. +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; -For information on the different integration partners, their documentation, and more, review the [dbt Semantic Layer integrations](https://www.getdbt.com/product/semantic-layer-integrations) page. + - +A wide variety of data applications across the modern data stack natively integrate with the dbt Semantic Layer and dbt metrics — from Business Intelligence tools to notebooks, data catalogs, and more. + +The dbt Semantic Layer integrations are capable of querying dbt metrics, importing definitions, surfacing the underlying data in partner tools, and leveraging the dbt Server. +For information on the partner integrations, their documentation, and more — refer to the [dbt Semantic Layer integrations](https://www.getdbt.com/product/semantic-layer-integrations) page. + + ## Related docs -- [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-semantic-layer) to learn more about the dbt Semantic Layer. -- [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) page for more information on plan availability. -- [Public Preview information](/docs/use-dbt-semantic-layer/quickstart-semantic-layer#public-preview) to understand what Public Preview for the dbt Semantic Layer means. -- [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) to understand best practices for designing and structuring metrics in your dbt project. 
+- [About the dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) + + diff --git a/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md deleted file mode 100644 index bca1ff3549f..00000000000 --- a/website/docs/docs/use-dbt-semantic-layer/dbt-semantic-layer.md +++ /dev/null @@ -1,162 +0,0 @@ ---- -title: "dbt Semantic Layer" -id: dbt-semantic-layer -description: "Introducing the dbt Semantic Layer" -sidebar_label: "dbt Semantic Layer" ---- - - - -The dbt Semantic Layer allows data teams to centrally define essential business metrics like `revenue`, `customer`, and `churn` in the modeling layer (your dbt project) for consistent self-service within downstream data tools like BI and metadata management solutions. - -The result? You have less duplicative coding for data teams and more consistency for data consumers. - -The dbt Semantic Layer has four main parts: - -- Define your metrics in version-controlled dbt project code -- Import your metric definitions via the [Metadata API](/docs/dbt-cloud-apis/metadata-api) -- Query your metric data via the dbt Proxy Server -- Explore and analyze dbt metrics in downstream tools - - - - -### What makes the dbt Semantic Layer different? - -The dbt Semantic Layer reduces code duplication and inconsistency regarding your business metrics. By moving metric definitions out of the BI layer and into the modeling layer, data teams can feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. - - -## Prerequisites -To use the dbt Semantic Layer, you’ll need to meet the following: - - - -- Have a multi-tenant dbt Cloud account, hosted in North America
          -- Have both your production and development environments running dbt version 1.3 or higher
          -- Use Snowflake data platform
          -- Install the dbt metrics package version ">=1.3.0", "<1.4.0" in your dbt project
          -- Set up the Metadata API in the integrated tool to import metric definitions - * Developer accounts will be able to query the Proxy Server using SQL, but won't be able to browse pre-populated dbt metrics in external tools, which requires access to the Metadata API
          -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
          - -
          - - - -- Have a multi-tenant dbt Cloud account, hosted in North America
          -- Have both your production and development environments running dbt version 1.2 (latest)
          -- Use Snowflake data platform
          -- Install the dbt metrics package version ">=0.3.0", "<0.4.0" in your dbt project
          -- Set up the Metadata API in the integrated tool to import metric definitions - * Developer accounts will be able to query the Proxy Server using SQL, but won't be able to browse pre-populated dbt metrics in external tools, which requires access to the Metadata API
          -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
          - -
          - - - -## Public Preview - -The dbt Semantic Layer is currently available for Public Preview, which means: - -— **Who?** The dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise) during Public Preview. Review [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) for more info on plan availability. - -- Team and Enterprise accounts will be able to set up the Semantic Layer and [Metadata API](/docs/dbt-cloud-apis/metadata-api) in the integrated -partner tool to import metric definition. -- Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse dbt metrics in external tools, which requires access to the Metadata API. - -— **What?** Public Previews provide early access to new features. The Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to General Availability. We may also introduce new functionality that is not backwards compatible. dbt Labs provides support, and relevant service level objectives (SLOs) apply. We will introduce pricing for the dbt Semantic Layer alongside the General Available (GA) release (future GA date to be announced). - -— **When?** Public Preview will end once the dbt Semantic Layer is available for GA. After GA, the dbt Semantic Layer will only be available to dbt Cloud **Team** and **Enterprise** plans. - -— **Where?** Public Preview is enabled at the account level so you don’t need to worry about enabling it per user. - -## Product architecture - -The dbt Semantic Layer product architecture includes four primary components: - -| Components | Information | Developer plans | Team plans | Enterprise plans | License | -| --- | --- | :---: | :---: | :---: | --- | -| **[dbt metrics](/docs/build/metrics)** | Allows you to define metrics in dbt Core. 
| ✅ | ✅ | ✅ | Open source, Core | -| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL | -| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja like query models and metrics, use macros), compiles the query into pure SQL, and executes the query against the data platform. | ✅

          _* Available during Public Preview only_ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | -| **[Metadata API](/docs/dbt-cloud-apis/metadata-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, metrics). The Metadata API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise | - - - -dbt Semantic Layer integrations will: - -- Leverage the Metadata API to fetch a list of objects and their attributes, like metrics -- Generate a dbt-SQL statement -- Then query the SQL proxy to evaluate the results of this statement - - -## Manage metrics - -:::info 📌 - -New to dbt or metrics? Check out our [Getting Started guide](/docs/get-started/getting-started/overview) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. - -::: - -If you're not sure whether to define a metric in dbt or not, ask yourself the following: - -> *Is this something our teams consistently need to report on?* - -An important business metric should be: - -- Well-defined (the definition is agreed upon throughout the entire organization) -- Time-bound (able to be compared across time) - -A great example of this is **revenue** — it can be aggregated on multiple levels (weekly, monthly, etc) and is key for the broader business to understand. - -- ✅ `Monthly recurring revenue` or `Weekly active users` or `Average order value` -- ❌ `1-off experimental metric` - - -### Design and define metrics - -**Design metrics** -To read about best practices on structuring and organizing your metrics, review our [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog post first. 
- -**Define metrics** -You can define your metrics in `.yml` files nested under a metrics key and to design or define your own metrics in your dbt project, review the following documents:
          - -- [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog to understand best practices for designing and structuring metrics in your dbt project -- [dbt metrics](docs/build/metrics) for in-depth detail on attributes, filters, how to define and query your metrics and [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics) -- [dbt Semantic Layer quickstart](/docs/use-dbt-semantic-layer/quickstart-semantic-layer) to get started -- [Understanding the components of the dbt Semantic Layer](https://docs.getdbt.com/blog/understanding-the-components-of-the-dbt-semantic-layer) blog post to see further examples - -Review our helpful metrics video below, which explains what metrics are, why they're important and how you can get started: - - - -## Related questions - -
          - How are you storing my data? -
          -
          The dbt Semantic Layer does not store, or cache, or log your data. On each query to the Semantic Layer, the resulting data passes through dbt Cloud servers where it is never stored, cached, or logged. The data from your data platform gets routed through dbt Cloud servers, to your connecting data tool.
          -
          -
          -
          - Is the dbt Semantic Layer open source? -
          -
          Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Metadata API are not open source.



          - -During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise).



          - -
            -
          • dbt Core users can define metrics in their dbt Core projects and calculate them using macros from the metrics package. To use the dbt Semantic Layer integrations, users will need to have a dbt Cloud account.


          • -
          • Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Metadata API.


          • -
          • Team and Enterprise accounts will be able to set up the Semantic Layer and Metadata API in the integrated partner tool to import metric definition.
          • -
          -
          -
          - Is there a dbt Semantic Layer discussion hub? -
          -
          Yes absolutely! Join the dbt Slack community and #dbt-cloud-semantic-layer slack channel for all things related to the dbt Semantic Layer. -
          -
          -
          -

          diff --git a/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md b/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md new file mode 100644 index 00000000000..8868c68ed20 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/dbt-sl.md @@ -0,0 +1,160 @@ +--- +title: "dbt Semantic Layer" +id: dbt-sl +description: "Learn how the dbt Semantic Layer enables data teams to centrally define and query metrics." +sidebar_label: "About the dbt Semantic Layer" +tags: [Semantic Layer] +hide_table_of_contents: true +pagination_next: "docs/use-dbt-semantic-layer/quickstart-sl" +pagination_prev: null +--- + + + + + +The dbt Semantic Layer, powered by [MetricFlow](/docs/build/about-metricflow), simplifies the process of defining and using critical business metrics, like `revenue` in the modeling layer (your dbt project). By centralizing metric definitions, data teams can ensure consistent self-service access to these metrics in downstream data tools and applications. The dbt Semantic Layer eliminates duplicate coding by allowing data teams to define metrics on top of existing models and automatically handles data joins. + +Moving metric definitions out of the BI layer and into the modeling layer allows data teams to feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. + +Refer to the [Why we need a universal semantic layer](https://www.getdbt.com/blog/universal-semantic-layer/) blog post to learn more. + +## Explore the dbt Semantic Layer + + +import Features from '/snippets/_sl-plan-info.md' + + + +
          + + + + + + + + + + + +
          + +
          + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + +The dbt Semantic Layer allows your data teams to centrally define essential business metrics like `revenue`, `customer`, and `churn` in the modeling layer (your dbt project) for consistent self-service within downstream data tools like BI and metadata management solutions. The dbt Semantic Layer provides the flexibility to define metrics on top of your existing models and then query those metrics and models in your analysis tools of choice. + +Resulting in less duplicate coding for data teams and more consistency for data consumers. + +The dbt Semantic Layer has these main parts: + +- Define your metrics in version-controlled dbt project code using [MetricFlow](/docs/build/about-metricflow) + * dbt_metrics is now deprecated +- Import your metric definitions using the [Discovery API](/docs/dbt-cloud-apis/discovery-api) +- Query your metric data with the dbt Proxy Server +- Explore and analyze dbt metrics in downstream tools + +### What makes the dbt Semantic Layer different? + +The dbt Semantic Layer reduces code duplication and inconsistency regarding your business metrics. By moving metric definitions out of the BI layer and into the modeling layer, your data teams can feel confident that different business units are working from the same metric definitions, regardless of their tool of choice. If a metric definition changes in dbt, it’s refreshed everywhere it’s invoked and creates consistency across all applications. You can also use the dbt Semantic Layer to query models and use macros. + + +## Prerequisites + + + + + + +## Manage metrics + +:::info 📌 + +New to dbt or metrics? Check out our [quickstart guide](/quickstarts) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. 
+ +::: + +If you're not sure whether to define a metric in dbt or not, ask yourself the following: + +> *Is this something our teams consistently need to report on?* + +An important business metric should be: + +- Well-defined (the definition is agreed upon throughout the entire organization) +- Time-bound (able to be compared across time) + +A great example of this is **revenue**. It can be aggregated on multiple levels (weekly, monthly, and so on) and is key for the broader business to understand. + +- ✅ `Monthly recurring revenue` or `Weekly active users` or `Average order value` +- ❌ `1-off experimental metric` + + +### Design and define metrics + +You can design and define your metrics in `.yml` files nested under a metrics key in your dbt project. For more information, refer to these docs:
          + +- [dbt metrics](/docs/build/metrics) for in-depth detail on attributes, filters, how to define and query your metrics, and [dbt-metrics package](https://github.com/dbt-labs/dbt_metrics) +- [dbt Semantic Layer quickstart](/docs/use-dbt-semantic-layer/quickstart-sl) to get started + +## Related questions + +
          + How do I migrate from the legacy Semantic Layer to the new one? +
          +
          If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
          +
          +
          + +
          + How are you storing my data? +
          +
          The dbt Semantic Layer doesn't store, cache, or log your data. On each query to the Semantic Layer, the resulting data passes through dbt Cloud servers where it's never stored, cached, or logged. The data from your data platform gets routed through dbt Cloud servers to your connecting data tool.
          +
          +
          +
          + Is the dbt Semantic Layer open source? +
          +
          Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL-licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



          + +During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers — Developer, Team, and Enterprise.



          + +
          +
          +
          + Is there a dbt Semantic Layer discussion hub? +
          +
          Yes, absolutely! Join the dbt Slack community and #dbt-cloud-semantic-layer Slack channel for all things related to the dbt Semantic Layer. +
          +
          +
          +

          +
          diff --git a/website/docs/docs/use-dbt-semantic-layer/gsheets.md b/website/docs/docs/use-dbt-semantic-layer/gsheets.md new file mode 100644 index 00000000000..2140becc9d8 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/gsheets.md @@ -0,0 +1,61 @@ +--- +title: "Google Sheets (beta)" +description: "Integrate with Google Sheets to query your metrics in a spreadsheet." +tags: [Semantic Layer] +sidebar_label: "Google Sheets (beta)" +--- + +:::info Beta functionality +Google Sheets integration with the dbt Semantic Layer is a [beta](/docs/dbt-versions/product-lifecycles#dbt-cloud) feature. +::: + +The dbt Semantic Layer offers a seamless integration with Google Sheets through a custom menu. This add-on allows you to build dbt Semantic Layer queries and return data on your metrics directly within Google Sheet. + +## Prerequisites + +1. You have a Google account with access to Google Sheets. +2. You can install Google add-ons. +3. You have [set up the dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl). +4. You have a dbt Cloud Environment ID and a [service token](/docs/dbt-cloud-apis/service-tokens) to authenticate with from a dbt Cloud account. + +## Installing the add-on + +1. Navigate to the [dbt Semantic Layer for Sheets App](https://gsuite.google.com/marketplace/app/foo/392263010968) to install the add-on. + + - You can also find it in Google Sheets by going to [**Extensions -> Add-on -> Get add-ons**](https://support.google.com/docs/answer/2942256?hl=en&co=GENIE.Platform%3DDesktop&oco=0#zippy=%2Cinstall-add-ons%2Cinstall-an-add-on) and searching for it there. +2. After installing, open the Add-On menu and select the "dbt Semantic Layer for Sheets". This will open a custom menu to the right-hand side of your screen. +3. Authenticate with your Host, dbt Cloud Environment ID, and Service Token. +4. Start querying your metrics using the **Query Builder**. 
For more info on the menu functions, refer to [Custom menu functions](#custom-menu-functions). + +When querying your data with Google Sheets: + +- It returns the data to the cell you have clicked on. +- The custom menu operation has a timeout limit of six (6) minutes. + +## Custom menu functions + +The custom menu provides the following capabilities: + +| Menu items | Description | +|---------------|-------------------------------------------------------| +| Metrics | Search and select metrics. | +| Group By | Search and select dimensions to group by. Dimensions are grouped by the entity of the semantic model they come from. | +| Granularity | Modify the granularity of the primary time dimension. | +| Where | Filter your data. This includes categorical and time filters. | +| Order By | Return your data ordered. | +| Limit | Set a limit for the rows of your output. | + + +## Filtering data + +To use the filter functionality, choose the [dimension](/docs/build/dimensions) you want to filter by and select the operation you want to filter on. + - For categorical dimensions, type in the dimension value you want to filter by (no quotes needed) and press enter. + - Continue adding additional filters as needed with AND and OR. If it's a time dimension, choose the operator and select from the calendar. + + + +**Limited Use Policy Disclosure** + +The dbt Semantic Layer for Sheet's use and transfer to any other app of information received from Google APIs will adhere to [Google API Services User Data Policy](https://developers.google.com/terms/api-services-user-data-policy), including the Limited Use requirements.
+ + diff --git a/website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md similarity index 60% rename from website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md rename to website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md index 2ed60e32ded..d0e5df18d94 100644 --- a/website/docs/docs/use-dbt-semantic-layer/quickstart-semantic-layer.md +++ b/website/docs/docs/use-dbt-semantic-layer/quickstart-sl.md @@ -1,79 +1,148 @@ --- -title: "Quickstart" -id: quickstart-semantic-layer -description: "Define metrics and set up the dbt Semantic Layer" -sidebar_label: "Quickstart" +title: "Get started with the dbt Semantic Layer" +id: quickstart-sl +description: "Use this guide to build and define metrics, set up the dbt Semantic Layer, and query them using the Semantic Layer APIs." +sidebar_label: "Get started with the dbt Semantic Layer" +tags: [Semantic Layer] +meta: + api_name: dbt Semantic Layer APIs --- -# dbt Semantic Layer quickstart + - -## Public Preview - -We're excited to announce the dbt Semantic Layer is currently available for Public Preview, which means: +import CreateModel from '/snippets/_sl-create-semanticmodel.md'; +import DefineMetrics from '/snippets/_sl-define-metrics.md'; +import ConfigMetric from '/snippets/_sl-configure-metricflow.md'; +import TestQuery from '/snippets/_sl-test-and-query-metrics.md'; +import ConnectQueryAPI from '/snippets/_sl-connect-and-query-api.md'; +import RunProdJob from '/snippets/_sl-run-prod-job.md'; + + +The dbt Semantic Layer, powered by [MetricFlow](/docs/build/about-metricflow), simplifies defining and using critical business metrics. It centralizes metric definitions, eliminates duplicate coding, and ensures consistent self-service access to metrics in downstream tools. + +MetricFlow, a powerful component of the dbt Semantic Layer, simplifies the creation and management of company metrics. 
It offers flexible abstractions, SQL query generation, and enables fast retrieval of metric datasets from a data platform. + +Use this guide to fully experience the power of the universal dbt Semantic Layer. Here are the following steps you'll take: + +- [Create a semantic model](#create-a-semantic-model) in dbt Cloud using MetricFlow +- [Define metrics](#define-metrics) in dbt Cloud using MetricFlow +- [Test and query metrics](#test-and-query-metrics) with MetricFlow +- [Run a production job](#run-a-production-job) in dbt Cloud +- [Set up dbt Semantic Layer](#setup) in dbt Cloud +- [Connect and query API](#connect-and-query-api) with dbt Cloud + +MetricFlow allows you to define metrics in your dbt project and query them whether in dbt Cloud or dbt Core with [MetricFlow commands](/docs/build/metricflow-commands). + +However, to experience the power of the universal [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and query those metrics in downstream tools, you'll need a dbt Cloud [Team or Enterprise](https://www.getdbt.com/pricing/) account. + +## Prerequisites + +import SetUp from '/snippets/_v2-sl-prerequisites.md'; + + + +:::tip +New to dbt or metrics? Try our [Jaffle shop example project](https://github.com/dbt-labs/jaffle-sl-template) to help you get started! +::: + +## Create a semantic model + + + +## Define metrics + + -— **Who?** The dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise) during Public Preview. Review [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) for more info on plan availability. +## Test and query metrics -- Team and Enterprise accounts will be able to set up the Semantic Layer and [Metadata API](/docs/dbt-cloud-apis/metadata-api) in the integrated -partner tool to import metric definition. 
-- Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse dbt metrics in external tools, which requires access to the Metadata API. + -— **What?** Public Previews provide early access to new features. The Semantic Layer is stable and you can use it for production deployments, but there may still be some planned additions and modifications to product behaviors before moving to General Availability. We may also introduce new functionality that is not backwards compatible. dbt Labs provides support, and relevant service level objectives (SLOs) apply. We will introduce pricing for the dbt Semantic Layer alongside the General Available (GA) release (future GA date to be announced). +## Run a production job + + + + + +
          + +What’s happening internally? +- Merging the code into your main branch allows dbt Cloud to pull those changes and build the definition in the manifest produced by the run.
          +- Re-running the job in the deployment environment helps materialize the models, which the metrics depend on, in the data platform. It also makes sure that the manifest is up to date.
          +- The Semantic Layer APIs pull in the most recent manifest and allow your integration information to extract metadata from it. +
          + +## Set up dbt Semantic Layer + +import SlSetUp from '/snippets/_new-sl-setup.md'; + + + + +## Connect and query API + + + + +## FAQs + +If you're encountering some issues when defining your metrics or setting up the dbt Semantic Layer, check out a list of answers to some of the questions or problems you may be experiencing. -— **When?** Public Preview will end once the dbt Semantic Layer is available for GA. After GA, the dbt Semantic Layer will only be available to dbt Cloud **Team** and **Enterprise** plans. +
          + How do I migrate from the legacy Semantic Layer to the new one? +
          +
          If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
          +
          +
          +
          +How are you storing my data? +User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours. +
          +
          +Is the dbt Semantic Layer open source? +The dbt Semantic Layer is proprietary; however, some components of the dbt Semantic Layer are open source, like dbt-core and MetricFlow.

          dbt Cloud Developer or dbt Core users can define metrics in their project, including a local dbt Core project, using the dbt Cloud IDE or the MetricFlow CLI. However, to experience the universal dbt Semantic Layer and access those metrics using the API or downstream tools, users must be on a dbt Cloud Team or Enterprise plan.
          +

          + +## Next steps -— **Where?** Public Preview is enabled at the account level so you don’t need to worry about enabling it per user. +- [Build your metrics](/docs/build/build-metrics-intro) +- [Set up dbt Semantic Layer](/docs/use-dbt-semantic-layer/setup-sl) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) +
          + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; -## Introduction + To try out the features of the dbt Semantic Layer, you first need to have a dbt project set up. This quickstart guide will lay out the following steps, and recommends a workflow that demonstrates some of its essential features: -- Install dbt metrics package +- Install dbt metrics package + * Note: this package will be deprecated very soon and we highly recommend you to use the new [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl?version=1.6), available in dbt v 1.6 or higher. - Define metrics - Query, and run metrics - Configure the dbt Semantic Layer ## Prerequisites -To use the dbt Semantic Layer, you’ll need to meet the following: - - -- Have a multi-tenant dbt Cloud account, hosted in North America
          -- Have both your production and development environments running dbt version 1.3 or higher
          -- Use Snowflake data platform
          -- Install the dbt metrics package version ">=1.3.0", "<1.4.0" in your dbt project
          -- Set up the Metadata API in the integrated tool to import metric definitions - * Developer accounts will be able to query the Proxy Server using SQL, but won't be able to browse pre-populated dbt metrics in external tools, which requires access to the Metadata API
          -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
          - -
          - - - -- Have a multi-tenant dbt Cloud account, hosted in North America
          -- Have both your production and development environments running dbt version 1.2 (latest)
          -- Use Snowflake data platform
          -- Install the dbt metrics package version ">=0.3.0", "<0.4.0" in your dbt project
          -- Set up the Metadata API in the integrated tool to import metric definitions - * Developer accounts will be able to query the Proxy Server using SQL, but won't be able to browse pre-populated dbt metrics in external tools, which requires access to the Metadata API
          -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
          +To use the dbt Semantic Layer, you’ll need to meet the following: -
          + - + :::info 📌 -New to dbt or metrics? Check out our [Getting Started guide](/docs/get-started/getting-started/overview) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. +New to dbt or metrics? Check out our [quickstart guide](/quickstarts) to build your first dbt project! If you'd like to define your first metrics, try our [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example project. ::: ## Installing dbt metrics package -The dbt Semantic Layer supports the calculation of metrics by using the [dbt metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/). You can install the dbt metrics package in your dbt project by copying the below code blocks. +The dbt Semantic Layer supports the calculation of metrics by using the [dbt metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/). You can install the dbt metrics package in your dbt project by copying the below code blocks. @@ -95,16 +164,6 @@ packages: - - -```yml -packages: - - package: dbt-labs/metrics - version: [">=0.2.0", "<0.3.0"] -``` - - - 1. Paste the dbt metrics package code in your `packages.yml` file. 2. Run the [`dbt deps` command](/reference/commands/deps) to install the package. @@ -119,11 +178,6 @@ Review our helpful metrics video below, which explains what metrics are, why the -### Design metrics - -To read about best practices on structuring and organizing your metrics, review our [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog post first. - -### Define metrics Now that you've organized your metrics folder and files, you can define your metrics in `.yml` files nested under a `metrics` key. 1. Add the metric definitions found in the [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop_metrics) example to your dbt project. 
For example, to add an expenses metric, reference the following metrics you can define directly in your metrics folder: @@ -194,9 +248,7 @@ metrics: 2. Commit and merge the code changes that contain the metric definitions. 3. If you'd like to further design and define your own metrics, review the following documentation: - - [dbt metrics](/docs/build/metrics) will povide you in-depth detail on attributes, properties, filters, and how to define and query metrics. - - - Review [How to design and structure dbt metrics: Recommendations for getting started](https://docs.getdbt.com/blog/how-to-design-and-structure-metrics) blog to understand best practices for designing and structuring metrics in your dbt project. + - [dbt metrics](/docs/build/metrics) will provide you in-depth detail on attributes, properties, filters, and how to define and query metrics. ## Develop and query metrics @@ -224,11 +276,11 @@ Once you’ve defined metrics in your dbt project, you can perform a job run in - Merging the code into your main branch allows dbt Cloud to pull those changes and builds the definition in the manifest produced by the run. - Re-running the job in the deployment environment helps materialize the models, which the metrics depend on, in the data platform. It also makes sure that the manifest is up to date. -- Your dbt Metadata API pulls in the most recent manifest and allows your integration information to extract metadata from it. +- Your dbt Discovery API pulls in the most recent manifest and allows your integration information to extract metadata from it. ## Set up dbt Semantic Layer - + ## Troubleshooting @@ -244,13 +296,13 @@ If you're encountering some issues when defining your metrics or setting up the
          Is the dbt Semantic Layer open source?
          -
          Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Metadata API are not open source.



          +
          Some components of the dbt Semantic Layer are open source like dbt-core, the dbt_metrics package, and the BSL-licensed dbt-server. The dbt Proxy Server (what is actually compiling the dbt code) and the Discovery API are not open source.



          During Public Preview, the dbt Semantic Layer is open to all dbt Cloud tiers (Developer, Team, and Enterprise).



          • dbt Core users can define metrics in their dbt Core projects and calculate them using macros from the metrics package. To use the dbt Semantic Layer integrations, you will need to have a dbt Cloud account.




          • -
          • Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Metadata API.




          • -
          • Team and Enterprise accounts will be able to set up the Semantic Layer and Metadata API in the integrated partner tool to import metric definitions.
          • +
          • Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Discovery API.




          • +
          • Team and Enterprise accounts will be able to set up the Semantic Layer and Discovery API in the integrated partner tool to import metric definitions.
          @@ -313,9 +365,7 @@ The reason you're experiencing this error is because we changed the type diff --git a/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md b/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md deleted file mode 100644 index 90fabea0bca..00000000000 --- a/website/docs/docs/use-dbt-semantic-layer/set-dbt-semantic-layer.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "Set up the dbt Semantic Layer" -id: setup-dbt-semantic-layer -description: "You can set up the dbt Semantic Layer in dbt Cloud." -sidebar_label: "Set up the dbt Semantic Layer" ---- - -With the dbt Semantic Layer, you'll be able to centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. - -## Prerequisites - -Before you set up the dbt Semantic Layer, make sure you meet the following: - - - -- Have a multi-tenant dbt Cloud account, hosted in North America
          -- Have both your production and development environments running dbt version 1.3 or higher
          -- Use Snowflake data platform
          -- Install the dbt metrics package version ">=1.3.0", "<1.4.0" in your dbt project
          -- Set up the Metadata API in the integrated tool to import metric definitions - * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Metadata API
          -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
          - -
          - - - -- Have a multi-tenant dbt Cloud account, hosted in North America
          -- Have both your production and development environments running dbt version 1.2 (latest)
          -- Use Snowflake data platform
          -- Install the dbt metrics package version ">=0.3.0", "<0.4.0" in your dbt project
          -- Set up the Metadata API in the integrated tool to import metric definitions - * Developer accounts will be able to query the Proxy Server using SQL, but will not be able to browse pre-populated dbt metrics in external tools, which requires access to the Metadata API
          -- Recommended - Review the dbt metrics page and Understanding the components of the dbt Semantic Layer blog
          - -
          - - - - - -## Set up dbt Semantic Layer - - - -
          - - -## Related docs - -- [Integrated partner tools](https://www.getdbt.com/product/semantic-layer-integrations) for info on the different integration partners and their documentation -- [Product architecture](/docs/use-dbt-semantic-layer/dbt-semantic-layer#product-architecture) page for more information on plan availability -- [dbt metrics](/docs/build/metrics) for in-depth detail on attributes, properties, filters, and how to define and query metrics -- [dbt Server repo](https://github.com/dbt-labs/dbt-server), which is a persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations diff --git a/website/docs/docs/use-dbt-semantic-layer/setup-sl.md b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md new file mode 100644 index 00000000000..4c88ee50b25 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/setup-sl.md @@ -0,0 +1,99 @@ +--- +title: "Set up the dbt Semantic Layer" +id: setup-sl +description: "Seamlessly set up the dbt Semantic Layer in dbt Cloud using intuitive navigation." +sidebar_label: "Set up your Semantic Layer" +tags: [Semantic Layer] +--- + + + + +With the dbt Semantic Layer, you can centrally define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. + +## Prerequisites + + +import SetUp from '/snippets/_v2-sl-prerequisites.md'; + + + +## Set up dbt Semantic Layer + +import SlSetUp from '/snippets/_new-sl-setup.md'; + + + + + + + + + +import LegacyInfo from '/snippets/_legacy-sl-callout.md'; + + + +With the dbt Semantic Layer, you can define business metrics, reduce code duplication and inconsistency, create self-service in downstream tools, and more. Configure the dbt Semantic Layer in dbt Cloud to connect with your integrated partner tool. 
+ +## Prerequisites + + + + +## Set up dbt Semantic Layer + +:::tip +If you're using the legacy Semantic Layer, dbt Labs strongly recommends that you [upgrade your dbt version](/docs/dbt-versions/upgrade-core-in-cloud) to dbt v1.6 or higher to use the latest dbt Semantic Layer. Refer to the dedicated [migration guide](/guides/migration/sl-migration) for more info. + +::: + + * Team and Enterprise accounts can set up the Semantic Layer and [Discovery API](/docs/dbt-cloud-apis/discovery-api) in the integrated partner tool to import metric definitions. + * Developer accounts can query the Proxy Server using SQL but won't be able to browse dbt metrics in external tools, which requires access to the Discovery API. + + +1. Log in to your dbt Cloud account. +2. Go to **Account Settings**, and then **Service Tokens** to create a new [service account API token](/docs/dbt-cloud-apis/service-tokens). Save your token somewhere safe. +3. Assign permissions to service account tokens depending on the integration tool you choose. Refer to the [integration partner documentation](https://www.getdbt.com/product/semantic-layer-integrations) to determine the permission sets you need to assign. +4. Go to **Deploy** > **Environments**, and select your **Deployment** environment. +5. Click **Settings** on the top right side of the page. +6. Click **Edit** on the top right side of the page. +7. Select dbt version 1.2 or higher. +8. Toggle the Semantic Layer **On**. +9. Copy the full proxy server URL (like `https://eagle-hqya7.proxy.cloud.getdbt.com`) to connect to your [integrated partner tool](https://www.getdbt.com/product/semantic-layer-integrations). +10. Use the URL in the data source configuration of the integrated partner tool. +11. Use the data platform login credentials that make sense for how the data is consumed. + +:::info📌 + +It is _not_ recommended that you use your dbt Cloud credentials due to elevated permissions. 
Instead, you can use your specific integration tool permissions. + +::: + +12. Set up the [Discovery API](/docs/dbt-cloud-apis/discovery-api) (Team and Enterprise accounts only) in the integrated partner tool to import the metric definitions. The [integrated partner tool](https://www.getdbt.com/product/semantic-layer-integrations) will treat the dbt Server as another data source (like a data platform). This requires: + +- The account ID, environment ID, and job ID (which is visible in the job URL) +- An [API service token](/docs/dbt-cloud-apis/service-tokens) with job admin and metadata permissions +- Add the items above to the relevant fields in your integration tool + + +
          + +
          + +## Related docs + +- [Build your metrics](/docs/build/build-metrics-intro) +- [Available integrations](/docs/use-dbt-semantic-layer/avail-sl-integrations) +- [Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) +- [Migrate your legacy Semantic Layer](/guides/migration/sl-migration) +- [Get started with the dbt Semantic Layer](/docs/use-dbt-semantic-layer/quickstart-sl) diff --git a/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md new file mode 100644 index 00000000000..dc75016eb91 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/sl-architecture.md @@ -0,0 +1,77 @@ +--- +title: "dbt Semantic Layer architecture" +id: sl-architecture +description: "dbt Semantic Layer product architecture and related questions." +sidebar_label: "Architecture" +tags: [Semantic Layer] +pagination_next: null +--- + + + + +The dbt Semantic Layer allows you to define metrics and use various interfaces to query them. The Semantic Layer does the heavy lifting to find where the queried data exists in your data platform and generates the SQL to make the request (including performing joins). + + + +## dbt Semantic Layer components + +The dbt Semantic Layer includes the following components: + + +| Components | Information | Developer plans | Team plans | Enterprise plans | License | +| --- | --- | :---: | :---: | :---: | --- | +| **[MetricFlow](/docs/build/about-metricflow)** | MetricFlow in dbt allows users to centrally define their semantic models and metrics with YAML specifications. | ✅ | ✅ | ✅ | BSL package (code is source available) | +| **MetricFlow Server**| A proprietary server that takes metric requests and generates optimized SQL for the specific data platform. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise)| +| **Semantic Layer Gateway** | A service that passes queries to MetricFlow server and executes the SQL generated by MetricFlow against the data platform|

          ❌| ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | +| **Semantic Layer APIs** | The interfaces that allow users to submit metric queries using the GraphQL and JDBC APIs. They also serve as the foundation for building first-class integrations with various tools. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise)| + + +## Related questions + +
          + How do I migrate from the legacy Semantic Layer to the new one? +
          +
          If you're using the legacy Semantic Layer, we highly recommend you upgrade your dbt version to dbt v1.6 or higher to use the new dbt Semantic Layer. Refer to the dedicated migration guide for more info.
          +
          +
          + +
          +How are you storing my data? +User data passes through the Semantic Layer on its way back from the warehouse. dbt Labs ensures security by authenticating through the customer's data warehouse. Currently, we don't cache data for the long term, but it might temporarily stay in the system for up to 10 minutes, usually less. In the future, we'll introduce a caching feature that allows us to cache data on our infrastructure for up to 24 hours. +
          +
          +Is the dbt Semantic Layer open source? +The dbt Semantic Layer is proprietary; however, some components of the dbt Semantic Layer are open source, like dbt-core and MetricFlow.

          The universal dbt Semantic Layer is available to all Team and Enterprise plans during public beta. Users on dbt Cloud Developer plans or dbt Core users can use MetricFlow only to define and test metrics locally.
          +
          + Is there a dbt Semantic Layer discussion hub? +
          +
          Yes, absolutely! Join the dbt Slack community and #dbt-cloud-semantic-layer Slack channel for all things related to the dbt Semantic Layer. +
          +
          +
          + +
          + + + +## Product architecture + +The dbt Semantic Layer product architecture includes four primary components: + +| Components | Information | Developer plans | Team plans | Enterprise plans | License | +| --- | --- | :---: | :---: | :---: | --- | +| **[dbt project](/docs/build/metrics)** | Define models and metrics in dbt Core.
          *Note, we will deprecate and no longer support the dbt_metrics package. | ✅ | ✅ | ✅ | Open source, Core | +| **[dbt Server](https://github.com/dbt-labs/dbt-server)**| A persisted HTTP server that wraps dbt core to handle RESTful API requests for dbt operations. | ✅ | ✅ | ✅ | BSL | +| **SQL Proxy** | Reverse-proxy that accepts dbt-SQL (SQL + Jinja like query models and metrics, use macros), compiles the query into pure SQL, and executes the query against the data platform. | ✅

          _* Available during Public Preview only_ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | +| **[Discovery API](/docs/dbt-cloud-apis/discovery-api)** | Accesses metric definitions primarily via integrations and is the source of truth for objects defined in dbt projects (like models, macros, sources, metrics). The Discovery API is updated at the end of every dbt Cloud run. | ❌ | ✅ | ✅ | Proprietary, Cloud (Team & Enterprise) | + + + +dbt Semantic Layer integrations will: + +- Leverage the Discovery API to fetch a list of objects and their attributes, like metrics +- Generate a dbt-SQL statement +- Then query the SQL proxy to evaluate the results of this statement + +
          diff --git a/website/docs/docs/use-dbt-semantic-layer/tableau.md b/website/docs/docs/use-dbt-semantic-layer/tableau.md new file mode 100644 index 00000000000..c505ea323f4 --- /dev/null +++ b/website/docs/docs/use-dbt-semantic-layer/tableau.md @@ -0,0 +1,67 @@ +--- +title: "Tableau (beta)" +description: "Use Tableau worksheets to query the dbt Semantic Layer and produce dashboards with trusted data." +tags: [Semantic Layer] +sidebar_label: "Tableau (beta)" +--- + +:::info Beta functionality +The Tableau integration with the dbt Semantic Layer is a [beta feature](/docs/dbt-versions/product-lifecycles#dbt-cloud). +::: + + +The Tableau integration allows you to use worksheets to query the Semantic Layer directly and produce your dashboards with trusted data. This integration provides a live connection to the dbt Semantic Layer through Tableau Desktop. + +## Prerequisites + +1. You must have [Tableau Desktop](https://www.tableau.com/en-gb/products/desktop) installed +2. Authenticate with either Tableau Server or Tableau Cloud +3. You need your dbt Cloud host, [Environment ID](/docs/use-dbt-semantic-layer/setup-sl#set-up-dbt-semantic-layer), and [service token](/docs/dbt-cloud-apis/service-tokens) to log in. This account should be set up with the dbt Semantic Layer. +4. You must have a dbt Cloud Team or Enterprise [account](https://www.getdbt.com/pricing) and multi-tenant [deployment](/docs/cloud/about-cloud/regions-ip-addresses). (Single-Tenant coming soon) + + +## Installing + +1. Download our [connector file](https://github.com/dbt-labs/semantic-layer-tableau-connector/releases/download/v1.0.0/dbt_semantic_layer.taco) locally and add it to your default folder: + - Windows: `C:\Users\\[Windows User]\Documents\My Tableau Repository\Connectors` + - Mac: `/Users/[user]/Documents/My Tableau Repository/Connectors` + - Linux: `/opt/tableau/connectors` +2. 
Install the [JDBC driver](/docs/dbt-cloud-apis/sl-jdbc) to the folder based on your operating system: + - Windows: `C:\Program Files\Tableau\Drivers` + - Mac: `~/Library/Tableau/Drivers` + - Linux: ` /opt/tableau/tableau_driver/jdbc` +3. Open Tableau Desktop and find the **dbt Semantic Layer by dbt Labs** connector on the left-hand side. +4. Connect with your Host, Environment ID, and service token information that's provided to you in your dbt Cloud Semantic Layer configuration. + + +## Using the integration + +Once you authenticate, the system will direct you to the data source page with all the metrics and dimensions configured in your Semantic Layer. + +- From there, go directly to a worksheet in the bottom left hand corner. +- Then, you'll find all the metrics and dimensions that are available to query on the left-hand side of your window. + +Visit the [Tableau documentation](https://help.tableau.com/current/pro/desktop/en-us/gettingstarted_overview.htm) to learn more about how to use Tableau worksheets and dashboards. + +## Things to note + +- All metrics use the "SUM" aggregation type, and this can't be altered. The dbt Semantic Layer controls the aggregation type and it is intentionally fixed. Keep in mind that the underlying aggregation in the dbt Semantic Layer might not be "SUM" (even though "SUM" is Tableau's default). +- Tableau surfaces all metrics and dimensions from the dbt Semantic Layer on the left-hand side. Note, that not all metrics and dimensions can be combined with one another. You will receive an error message if a particular dimension cannot be sliced with a metric (or vice versa). + - To display available metrics and dimensions, dbt Semantic Layer returns metadata for a fake table with the dimensions and metrics as 'columns' on this table. Because of this, you can't actually query this table for previews or extracts. + - Since this is treated as a table, dbt Semantic Layer can't dynamically change what is available. 
This means we display _all_ available metrics and dimensions even if a particular metric and dimension combination isn't available. + +- Certain Table calculations like "Totals" and "Percent Of" may not be accurate when using metrics aggregated in a non-additive way (such as count distinct) + +## Unsupported functionality + +The following Tableau features aren't supported at this time; however, the dbt Semantic Layer may support some of this functionality in a future release: + +- Updating the data source page +- Using "Extract" mode to view your data +- Unioning Tables +- Writing Custom SQL +- Table Extensions +- Cross Database Joins +- All functions in Analysis --> Create Calculated Field +- Filtering on a Date Part time dimension for a Cumulative metric type +- Changing your date dimension to use "Week Number" diff --git a/website/docs/docs/verified-adapters.md b/website/docs/docs/verified-adapters.md new file mode 100644 index 00000000000..170bc8f885b --- /dev/null +++ b/website/docs/docs/verified-adapters.md @@ -0,0 +1,24 @@ +--- +title: "Verified adapters" +id: "verified-adapters" +hide_table_of_contents: true +--- + + +dbt Labs has a rigorous verified adapter program that provides reassurance to users about which adapters can be trusted for use in production, have been tested, and are actively maintained and updated. The process covers development, documentation, user experience, and maintenance aspects. + +These adapters then earn a "Verified" status so that users can have a certain level of trust and expectation when they use them. The adapters also have maintainers and we recommend using the adapter's verification status to determine its quality and health. + +The verification process serves as the on-ramp to integration with dbt Cloud. As such, we restrict applicants to data platform vendors with whom we are already engaged. + +To learn more, see [Verifying a new adapter](/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter). 
+ +import MSCallout from '/snippets/_microsoft-adapters-soon.md'; + + + +Here are the verified data platforms that connect to dbt and its latest version. + +import AdaptersVerified from '/snippets/_adapters-verified.md'; + + diff --git a/website/docs/faqs/API/rotate-token.md b/website/docs/faqs/API/rotate-token.md new file mode 100644 index 00000000000..a880825ea3f --- /dev/null +++ b/website/docs/faqs/API/rotate-token.md @@ -0,0 +1,43 @@ +--- +title: How can I rotate my user API token? +description: "Instructions on how to rotate your API token" +sidebar_label: 'Rotate your user API token' +id: rotate-token +--- + +For security reasons and best practices, you should aim to rotate API keys every so often. + +1. Rotate your [User API token](/docs/dbt-cloud-apis/user-tokens) by replacing `YOUR_USER_ID`, `YOUR_CURRENT_TOKEN`, and `YOUR_ACCESS_URL` with your information in the following request. + +``` +curl --location --request POST 'https://YOUR_ACCESS_URL/api/v2/users/YOUR_USER_ID/apikey/' \ +--header 'Authorization: Token YOUR_CURRENT_TOKEN' +``` + +* Find your `YOUR_USER_ID` by reading [How to find your user ID](/faqs/Accounts/find-user-id). +* Find your `YOUR_CURRENT_TOKEN` by going to **Profile Settings** -> **API Access** and copying the API key. +* Find [`YOUR_ACCESS_URL`](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. + +:::info Example + +If `YOUR_USER_ID` = `123`, `YOUR_CURRENT_TOKEN` = `abcf9g`, and `YOUR_ACCESS_URL` = `cloud.getdbt.com`, then your curl request will be: + +``` +curl --location --request POST 'https://cloud.getdbt.com/api/v2/users/123/apikey/' \ +--header 'Authorization: Token abcf9g' +``` +::: + +2. Find the new key in the API response or in dbt Cloud. + +3. To find the new key in dbt Cloud, go to **Profile Settings** -> **API Access**. 
+ + +### dbt Cloud deployments + +If your [dbt Cloud deployment](/docs/cloud/about-cloud/regions-ip-addresses) uses a different access URL, replace `cloud.getdbt.com` with the URL of your instance. + +For example, if your deployment is Virtual Private dbt: + +✅ `http://cloud.customizedurl.getdbt.com/`
          +❌ `http://cloud.getdbt.com/`
          diff --git a/website/docs/faqs/Accounts/change-users-license.md b/website/docs/faqs/Accounts/change-users-license.md new file mode 100644 index 00000000000..8755b946126 --- /dev/null +++ b/website/docs/faqs/Accounts/change-users-license.md @@ -0,0 +1,19 @@ +--- +title: How do I change a user license type to read-only in dbt Cloud? +description: "Changing a user license type to read-only in dbt Cloud" +sidebar_label: 'How to change a user license type to read-only' +id: change-user-license + +--- + +To change the license type for a user from `developer` to `read-only` or `IT` in dbt Cloud, you must be an account owner or have admin privileges. You might make this change to free up a billable seat but retain the user’s access to view the information in the dbt Cloud account. + +1. From dbt Cloud, click the gear icon at the top right and select **Account Settings**. + + + +2. In **Account Settings**, select **Users** under **Teams**. +3. Select the user you want to remove, and click **Edit** in the bottom of their profile. +4. For the **License** option, choose **Read-only** or **IT** (from **Developer**), and click **Save**. + + diff --git a/website/docs/faqs/Accounts/cloud-upgrade-instructions.md b/website/docs/faqs/Accounts/cloud-upgrade-instructions.md new file mode 100644 index 00000000000..f8daf393f9b --- /dev/null +++ b/website/docs/faqs/Accounts/cloud-upgrade-instructions.md @@ -0,0 +1,90 @@ +--- +title: "How to upgrade a dbt Cloud account" +id: "cloud-upgrade-instructions" +description: "Instructions for upgrading a dbt Cloud account after the trial ends." +--- + +dbt Cloud offers [several plans](https://www.getdbt.com/pricing/) with different features that meet your needs. This document is for dbt Cloud admins and explains how to select a plan in order to continue using dbt Cloud. + +:::tip Before you begin +- You **_must_** be part of the [Owner](/docs/cloud/manage-access/self-service-permissions) user group to make billing changes. 
Users not included in this group will not see these options. +- All amounts shown in dbt Cloud are in U.S. Dollars (USD) +- When your trial expires, your account's default plan enrollment will be a Team plan. +::: + + +## Select a plan +When your [14-day trial](https://www.getdbt.com/signup/) ends or if your subscription payment is past due, you'll need to select a plan in order to continue using your account: + +- Upon logging in, you should see an "Account locked" pop-up message with instructions to unlock your account and update your payment details +- Click **Go to Billing** to go to the billing page +- Under **Billing**, you can review the available dbt Cloud [plans](https://www.getdbt.com/pricing/) and their features + + +To unlock your account and select a plan, review the following guidance per plan type: + +### Developer plan + +1. To select a Developer plan, choose the Developer tab under **Billing**. +2. Click **Select Plan** on the right. +3. Confirm your plan selection on the pop-up message. +4. This automatically unlocks your dbt Cloud account, and you can now enjoy the benefits of the Developer plan. 🎉 + + + +### Team plan + +1. When your trial expires, your account's default plan enrollment will be a Team plan. +2. To unlock your account and continue using the Team plan, you need to enter your payment details. +3. Go to **Payment Information** and click **Edit** on the right. +4. Enter your payment details and click **Save**. +5. This automatically unlocks your dbt Cloud account, and you can now enjoy the benefits of the Team plan. 🎉 + + + +### Enterprise plan + +1. If you're interested in our Enterprise plan, select the Enterprise tab under **Billing**. +2. Click **Contact Sales** on the right. This opens a chat window for you to contact the dbt Cloud Support team, who will connect you to our Sales team. +3. Once you submit your request, our Sales team will contact you with more information. + + + +4. 
Alternatively, you can [contact](https://www.getdbt.com/contact/) our Sales team directly to chat about how dbt Cloud can help you and your team. + +## Related questions + +For commonly asked billing questions, refer to the dbt Cloud [pricing page](https://www.getdbt.com/pricing/). + +
          + How does billing work? +
          +
          Team plans are billed monthly on the credit card used to sign up, based on [developer seat count and usage](/docs/cloud/billing). You’ll also be sent a monthly receipt to the billing email of your choice. You can change any billing information in your Account Settings > Billing page.



          + + Enterprise plan customers are billed annually based on the number of developer seats, as well as any additional services + features in your chosen plan.
          +
          +
          +
          + Can I upgrade or downgrade my plan? +
          +
          Yes, you can upgrade or downgrade at any time. Account Owners can access their dedicated billing section via the account settings page.



          + + If you’re not sure which plan is right for you, get in touch and we’ll be happy to help you find one that fits your needs.
          +
          +
          +
          + Can I pay by invoice? +
          +
          Currently, dbt Cloud Team plan payments must be made with a credit card, and by default they will be billed monthly based on the number of [developer seats and usage](/docs/cloud/billing).



          + + We don’t have any plans to do invoicing for Team plan accounts in the near future, but we do currently support invoices for companies on the dbt Cloud Enterprise plan. Feel free to contact us to build your Enterprise pricing plan.
          +
          +
          +
          + Why did I receive a Failed payment error email? +
          +
          This means we were unable to charge the credit card you have on file, or you have not provided an updated card for payment. If you're a current account owner with a card on file, contact your credit card issuer to inquire as to why your card was declined or update the credit card on your account.



          + + Your Account Owner can update payment details in the Account Settings -> Billing page. Click Edit next to your card details, double check your information is up-to-date, and we'll give it another go at the next billing run.
          +
          +
          diff --git a/website/docs/faqs/Accounts/delete-users.md b/website/docs/faqs/Accounts/delete-users.md new file mode 100644 index 00000000000..a7e422fd82c --- /dev/null +++ b/website/docs/faqs/Accounts/delete-users.md @@ -0,0 +1,30 @@ +--- +title: How do I delete a user in dbt Cloud? +description: "Deleting a user in dbt Cloud" +sidebar_label: 'How to delete a user' +id: delete-users + +--- + +To delete a user in dbt Cloud, you must be an account owner or have admin privileges. If the user has a `developer` license type, this will open up their seat for another user or allow the admins to lower the total number of seats. + +1. From dbt Cloud, click the gear icon at the top right and select **Account Settings**. + + + +2. In **Account Settings**, select **Users** under **Teams**. +3. Select the user you want to delete, then click **Edit**. +4. Click **Delete** in the bottom left. Click **Confirm Delete** to immediately delete the user without additional password prompts. This action cannot be undone. However, you can re-invite the user with the same information if the deletion was made in error. + + + +If you are on a **Teams** plan and you are deleting users to reduce the number of billable seats, you also need to take these steps to lower the license count: +1. In **Account Settings**, select **Billing**. +2. Enter the number of developer seats you want and make sure you fill in all the payment details, including the **Billing Address** section. If you leave any field blank, you won't be able to save your changes. +3. Click **Update Payment Information** to save your changes. + + + +## Related docs + +- [dbt Cloud licenses](/docs/cloud/manage-access/seats-and-users#licenses) diff --git a/website/docs/faqs/Accounts/find-user-id.md b/website/docs/faqs/Accounts/find-user-id.md new file mode 100644 index 00000000000..7f7eca2cbba --- /dev/null +++ b/website/docs/faqs/Accounts/find-user-id.md @@ -0,0 +1,17 @@ +--- +title: Where can I find my user id? 
+description: "Instructions on where to find your user ID" +sidebar_label: 'Where can I find my user ID' +id: find-user-id +--- + +Knowing your dbt Cloud user ID can help with actions related to [rotating your API token](/faqs/API/rotate-token), interacting with support, and more. + +To find your user ID in dbt Cloud, follow these steps: + +1. Go to **Account Settings**, **Team**, and then **Users**. +2. Select your user.
          +3. In the address bar, the number after `/users` is your user ID. +4. Copy that number or save it somewhere safe.
          + +For example, if the URL is `https://cloud.getdbt.com/settings/accounts/12345/users/67891` — the user ID is `67891`

          \ No newline at end of file diff --git a/website/docs/faqs/Accounts/payment-accepted.md b/website/docs/faqs/Accounts/payment-accepted.md index 2e26063c684..c0e949833a2 100644 --- a/website/docs/faqs/Accounts/payment-accepted.md +++ b/website/docs/faqs/Accounts/payment-accepted.md @@ -5,6 +5,6 @@ sidebar_label: 'Can I pay invoice' id: payment-accepted --- -Presently for Team plans, self-service dbt Cloud payments must be made via credit card and by default, they will be billed monthly based on the number of active developer seats. +Currently for Team plans, self-service dbt Cloud payments must be made with a credit card and by default, they will be billed monthly based on the number of [active developer seats and usage](/docs/cloud/billing). We don't have any plans to do invoicing for self-service teams in the near future, but we *do* currently support invoices for companies on the **dbt Cloud Enterprise plan.** Feel free to [contact us](https://www.getdbt.com/contact) to build your Enterprise pricing. 
diff --git a/website/docs/faqs/Accounts/slack.md b/website/docs/faqs/Accounts/slack.md index 01001141e2e..4faa60fb09a 100644 --- a/website/docs/faqs/Accounts/slack.md +++ b/website/docs/faqs/Accounts/slack.md @@ -5,4 +5,4 @@ sidebar_label: 'How to set up Slack' id: slack --- - + diff --git a/website/docs/faqs/Accounts/transfer-account.md b/website/docs/faqs/Accounts/transfer-account.md index f3bba49bd7a..d82dfbf505a 100644 --- a/website/docs/faqs/Accounts/transfer-account.md +++ b/website/docs/faqs/Accounts/transfer-account.md @@ -6,14 +6,14 @@ id: transfer-account --- -You can transfer your dbt Cloud [access control](/docs/collaborate/manage-access/about-access) to another user by following the steps below, depending on your dbt Cloud account plan: +You can transfer your dbt Cloud [access control](/docs/cloud/manage-access/about-user-access) to another user by following the steps below, depending on your dbt Cloud account plan: | Account plan| Steps | | ------ | ---------- | -| **Developer** | You can transfer ownership by changing the email directly on your dbt Cloud [profile page](https://cloud.getdbt.com/#/profile/). | +| **Developer** | You can transfer ownership by changing the email directly on your dbt Cloud profile page, which you can access using this URL when you replace `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan: `https://YOUR_ACCESS_URL/settings/profile` | | **Team** | Existing account admins with account access can add users to, or remove users from the owner group. | | **Enterprise** | Account admins can add users to, or remove users from a group with Account Admin permissions. | -| **If all account owners left the company** | If the account owner has left your organization, you will need to work with _your_ IT department to have incoming emails forwarded to the new account owner. 
Once your IT department has redirected the emails, you can request to reset the user password. Once you log in - you can change the email on the [Profile page](https://cloud.getdbt.com/#/profile/). | +| **If all account owners left the company** | If the account owner has left your organization, you will need to work with _your_ IT department to have incoming emails forwarded to the new account owner. Once your IT department has redirected the emails, you can request to reset the user password. Once you log in, you can change the email on the Profile page when you replace `YOUR_ACCESS_URL` with the [appropriate Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan: `https://YOUR_ACCESS_URL/settings/profile`. | When you make any account owner and email changes: diff --git a/website/docs/faqs/Core/install-pip-os-prereqs.md b/website/docs/faqs/Core/install-pip-os-prereqs.md index 8124eaaa215..41a4e4ec60e 100644 --- a/website/docs/faqs/Core/install-pip-os-prereqs.md +++ b/website/docs/faqs/Core/install-pip-os-prereqs.md @@ -6,7 +6,7 @@ id: install-pip-os-prereqs.md --- -Your operating system may require pre-installation setup before installing dbt Core with pip. After downloading and installing any dependencies specific to your development environment, you can proceed with the [pip installation of dbt Core](/docs/get-started/pip-install). +Your operating system may require pre-installation setup before installing dbt Core with pip. After downloading and installing any dependencies specific to your development environment, you can proceed with the [pip installation of dbt Core](/docs/core/pip-install). ### CentOS @@ -23,7 +23,13 @@ sudo yum install redhat-rpm-config gcc libffi-devel \ ### MacOS + +The MacOS requires Python 3.8 or higher to successfully install and run dbt Core. + + + The MacOS requires Python 3.7 or higher to successfully install and run dbt Core. 
+ To check the Python version: @@ -33,7 +39,7 @@ python --version ``` -If you need a compatible version, you can download and install [Python version 3.7 or higher for MacOS](https://www.python.org/downloads/macos). +If you need a compatible version, you can download and install [Python version 3.8 or higher for MacOS](https://www.python.org/downloads/macos). If your machine runs on an Apple M1 architecture, we recommend that you install dbt via [Rosetta](https://support.apple.com/en-us/HT211861). This is necessary for certain dependencies that are only supported on Intel processors. ### Ubuntu/Debian @@ -55,6 +61,15 @@ pip install cryptography~=3.4 Windows requires Python and git to successfully install and run dbt Core. + + +Install [Git for Windows](https://git-scm.com/downloads) and [Python version 3.8 or higher for Windows](https://www.python.org/downloads/windows/). + + + + + Install [Git for Windows](https://git-scm.com/downloads) and [Python version 3.7 or higher for Windows](https://www.python.org/downloads/windows/). + -For further questions, please see the [Python compatibility FAQ](/docs/faqs/Core/install-python-compatibility) +For further questions, please see the [Python compatibility FAQ](/faqs/Core/install-python-compatibility) diff --git a/website/docs/faqs/Core/install-python-compatibility.md b/website/docs/faqs/Core/install-python-compatibility.md index 9e20cbb0f85..5c536101f0c 100644 --- a/website/docs/faqs/Core/install-python-compatibility.md +++ b/website/docs/faqs/Core/install-python-compatibility.md @@ -1,19 +1,25 @@ --- title: What version of Python can I use? -description: "Python versions 3.7 and newer can be used with dbt Core" +description: "Python versions 3.8 and newer can be used with dbt Core" sidebar_label: 'Python version' id: install-python-compatibility --- - + -The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3.9, and 3.10. 
+The latest version of `dbt-core` is compatible with Python versions 3.8, 3.9, 3.10 and 3.11. - + -As of v1.0, `dbt-core` is compatible with Python versions 3.7, 3.8, and 3.9. +The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3.9, 3.10 and 3.11. + + + + + +The latest version of `dbt-core` is compatible with Python versions 3.7, 3.8, 3.9, and 3.10 diff --git a/website/docs/faqs/Docs/document-all-columns.md b/website/docs/faqs/Docs/document-all-columns.md index 0de7881bfdd..493054f2800 100644 --- a/website/docs/faqs/Docs/document-all-columns.md +++ b/website/docs/faqs/Docs/document-all-columns.md @@ -1,5 +1,5 @@ --- -title: Do I need to add a yaml entry for column for it to appear in the docs site? +title: Do I need to add a YAML entry for column for it to appear in the docs site? description: "All columns appear in your docs site" sidebar_label: 'Types of columns included in doc site' id: document-all-columns diff --git a/website/docs/faqs/Docs/document-other-resources.md b/website/docs/faqs/Docs/document-other-resources.md index b5d612fbcfc..b52d597aa2c 100644 --- a/website/docs/faqs/Docs/document-other-resources.md +++ b/website/docs/faqs/Docs/document-other-resources.md @@ -5,4 +5,4 @@ sidebar_label: 'Document other resources' id: document-other-resources --- -Yes! You can document almost everything in your project using the `description:` key. Check out the reference docs on [descriptions](description) for more info! +Yes! You can document almost everything in your project using the `description:` key. Check out the reference docs on [descriptions](/reference/resource-properties/description) for more info! 
diff --git a/website/docs/faqs/Docs/documenting-macros.md b/website/docs/faqs/Docs/documenting-macros.md index 71211f2c100..9a2036cd6bf 100644 --- a/website/docs/faqs/Docs/documenting-macros.md +++ b/website/docs/faqs/Docs/documenting-macros.md @@ -5,13 +5,11 @@ sidebar_label: 'Document macros' id: documenting-macros --- -The `macros:` key is new in 0.16.0. - -To document macros, use a [schema file](macro-properties) and nest the configurations under a `macros:` key +To document macros, use a [schema file](/reference/macro-properties) and nest the configurations under a `macros:` key ## Example - + ```yml version: 2 diff --git a/website/docs/faqs/Docs/long-descriptions.md b/website/docs/faqs/Docs/long-descriptions.md index f353050468e..cdf15a94120 100644 --- a/website/docs/faqs/Docs/long-descriptions.md +++ b/website/docs/faqs/Docs/long-descriptions.md @@ -5,19 +5,30 @@ sidebar_label: 'Write long descriptions' id: long-descriptions --- If you need more than a sentence to explain a model, you can: +1. Split your description over multiple lines using `>`. Interior line breaks are removed and Markdown can be used. This method is recommended for simple, single-paragraph descriptions: +```yml + version: 2 -1. Split your description over multiple lines ([yaml docs](https://yaml-multiline.info/)), like so: + models: + - name: customers + description: > + Lorem ipsum **dolor** sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. +``` - ```yml - version: 2 +2. Split your description over multiple lines using `|`. Interior line breaks are maintained and Markdown can be used. 
This method is recommended for more complex descriptions: +```yml + version: 2 - models: - - name: customers - description: > - Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod - tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, - quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo - consequat. - ``` + models: + - name: customers + description: | + ### Lorem ipsum -2. Use a [docs block](/documentation#using-docs-blocks) to write the description in a Markdown file. + * dolor sit amet, consectetur adipisicing elit, sed do eiusmod + * tempor incididunt ut labore et dolore magna aliqua. +``` + +3. Use a [docs block](/docs/collaborate/documentation#using-docs-blocks) to write the description in a separate Markdown file. diff --git a/website/docs/faqs/Docs/modify-owner-column.md b/website/docs/faqs/Docs/modify-owner-column.md new file mode 100644 index 00000000000..8395a182bb9 --- /dev/null +++ b/website/docs/faqs/Docs/modify-owner-column.md @@ -0,0 +1,15 @@ +--- +title: How do I populate the owner column in the generated docs? +description: "Modify owner column" +sidebar_label: 'Can I populate owner column in docs?' +id: modify-owner-column +--- + + +Due to the nature of the field, you won't be able to change the owner column in your generated documentation. + +The _owner_ field in `dbt-docs` is pulled from database metadata (`catalog.json`), meaning the owner of that table in the database. With the exception of exposures, it's not pulled from an `owner` field set within dbt. + +Generally, dbt's database user owns the tables created in the database. Source tables are usually owned by the service responsible for ingesting/loading them. + +If you set `meta.owner`, you should now be seeing that field appear under **meta** (pulled from dbt), but still not under the top-level **owner** field. 
diff --git a/website/docs/faqs/Environments/beta-release.md b/website/docs/faqs/Environments/beta-release.md deleted file mode 100644 index 5eef07d3510..00000000000 --- a/website/docs/faqs/Environments/beta-release.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: What is a beta release? -description: "How to try out beta features" -sidebar_label: 'What is a beta release?' -id: beta-release ---- -This is a chance to try out brand-new functionality. You get to start planning for use cases that the next minor version will unlock. We get to hear from you about unexpected behavior and nasty bugs, so that the release candidate has more polish and fewer surprises. diff --git a/website/docs/faqs/Environments/custom-branch-settings.md b/website/docs/faqs/Environments/custom-branch-settings.md new file mode 100644 index 00000000000..95929d2d393 --- /dev/null +++ b/website/docs/faqs/Environments/custom-branch-settings.md @@ -0,0 +1,26 @@ +--- +title: How do I use the `Custom Branch` settings in a dbt Cloud Environment? +description: "Use custom code from your repository" +sidebar_label: 'Custom Branch settings' +id: custom-branch-settings +--- + +In dbt Cloud environments, you can change your git settings to use a different branch in your dbt project repositories besides the default branch. When you make this change, you run dbt on a custom branch. When specified, dbt Cloud executes models using the custom branch setting for that environment. Development and deployment environments have slightly different effects. + +To specify a custom branch: +1. Edit an existing environment or create a new one +2. Select **Only run on a custom branch** under General Settings +3. Specify the **branch name or tag** + + +## Development + +In a development environment, the default branch (commonly the `main` branch) is a read-only branch found in the IDE's connected repositories, which you can use to create development branches. Identifying a custom branch overrides this default behavior. 
Instead, your custom branch becomes read-only and can be used to create development branches. You will no longer be able to make commits to the custom branch from within the dbt Cloud IDE. + +For example, you can use the `develop` branch of a connected repository. Edit an environment, select **Only run on a custom branch** in **General settings** , enter **develop** as the name of your custom branch. + + + +## Deployment + +When running jobs in a deployment environment, dbt will clone your project from your connected repository before executing your models. By default, dbt uses the default branch of your repository (commonly the `main` branch). To specify a different version of your project for dbt to execute during job runs in a particular environment, you can edit the Custom Branch setting as shown in the previous steps. \ No newline at end of file diff --git a/website/docs/faqs/Environments/delete-environment-job.md b/website/docs/faqs/Environments/delete-environment-job.md new file mode 100644 index 00000000000..eb9ac511a7c --- /dev/null +++ b/website/docs/faqs/Environments/delete-environment-job.md @@ -0,0 +1,48 @@ +--- +title: How to delete a job or environment in dbt Cloud? +description: "How to delete a job or environment" +sidebar_label: 'Delete a job or environment' +id: delete-environment-job +--- + + +To delete an environment or job in dbt Cloud, you must have a `developer` [license](/docs/cloud/manage-access/seats-and-users) and have the necessary [access permissions](/docs/cloud/manage-access/about-user-access). + +## Delete a job + +To delete a job or multiple jobs in dbt Cloud: + +1. Click **Deploy** on the navigation header. +2. Click **Jobs** and select the job(s) you want to delete. +3. Click **Settings** on the top right of the page and then click **Edit**. +4. Scroll to the bottom of the page and click **Delete** to delete the job.
          + +
          + +
          Delete a job
          +
          + +5. Confirm your action in the **Confirm Delete** pop-up by clicking **Confirm Delete** in the bottom right to delete the job immediately. This action cannot be undone. However, you can create a new job with the same information if the deletion was made in error. + +Refresh the page, and the deleted job should now be gone. If you want to delete multiple jobs, you'll need to perform these steps for each job. + +## Delete an environment + +Deleting an environment automatically deletes its associated job(s). If you want to keep those jobs, move them to a different environment first. To delete an environment in dbt Cloud: + +1. Click **Deploy** on the navigation header and then click **Environments** +2. Select the Environment you want to delete. +3. Click **Settings** on the top right of the page and then click **Edit**. +4. Scroll to the bottom of the page and click **Delete** to delete the environment.
          + +
          + +
          Delete an environment
          +
          + +5. Confirm your action in the **Confirm Delete** pop-up by clicking **Confirm Delete** in the bottom right to delete the environment immediately. This action cannot be undone. However, you can create a new environment with the same information if the deletion was made in error.

          + + +Refresh your page, and the deleted environment should now be gone. If you want to delete multiple environments, you'll need to perform these steps to delete each one. + +If you're having any issues, feel free to [contact us](mailto:support@getdbt.com) for additional help. diff --git a/website/docs/faqs/Environments/profile-env-vars.md b/website/docs/faqs/Environments/profile-env-vars.md index d9cea7946de..6b10cb5a5e8 100644 --- a/website/docs/faqs/Environments/profile-env-vars.md +++ b/website/docs/faqs/Environments/profile-env-vars.md @@ -4,4 +4,4 @@ description: "Use env_var in your profile" sidebar_label: 'Use env_var in your profile' id: profile-env-vars --- -Yes! Check out the docs on [environment variables](env_var) for more information. +Yes! Check out the docs on [environment variables](/reference/dbt-jinja-functions/env_var) for more information. diff --git a/website/docs/faqs/Environments/profile-name.md b/website/docs/faqs/Environments/profile-name.md index 9f9d8b0ae59..aade6c252db 100644 --- a/website/docs/faqs/Environments/profile-name.md +++ b/website/docs/faqs/Environments/profile-name.md @@ -4,4 +4,4 @@ description: "Use company name for profile name" sidebar_label: 'Naming your profile' id: profile-name --- -We typically use a company name for a profile name, and then use targets to differentiate between `dev` and `prod`. Check out the docs on [managing environments](/docs/collaborate/environments) for more information. +We typically use a company name for a profile name, and then use targets to differentiate between `dev` and `prod`. Check out the docs on [environments in dbt Core](/docs/core/dbt-core-environments) for more information. 
diff --git a/website/docs/faqs/Environments/target-names.md b/website/docs/faqs/Environments/target-names.md index 6d3cd78b8e0..2619e31c2c2 100644 --- a/website/docs/faqs/Environments/target-names.md +++ b/website/docs/faqs/Environments/target-names.md @@ -5,4 +5,4 @@ sidebar_label: 'Naming your target' id: target-names --- -We typically use targets to differentiate between development and production runs of dbt, naming the targets `dev` and `prod` respectively. Check out the docs on [managing environments](/docs/collaborate/environments) for more information. +We typically use targets to differentiate between development and production runs of dbt, naming the targets `dev` and `prod`, respectively. Check out the docs on [managing environments in dbt Core](/docs/core/dbt-core-environments) for more information. diff --git a/website/docs/faqs/Git/git-migration.md b/website/docs/faqs/Git/git-migration.md new file mode 100644 index 00000000000..775ae3679e3 --- /dev/null +++ b/website/docs/faqs/Git/git-migration.md @@ -0,0 +1,26 @@ +--- +title: "How to migrate git providers" +sidebar_label: "How to migrate git providers" +id: "git-migration" +hide_table_of_contents: true +description: "Learn how to migrate git providers in dbt Cloud with minimal disruption." +tags: [Git] +--- + +To migrate from one git provider to another, refer to the following steps to minimize disruption: + +1. Outside of dbt Cloud, you'll need to import your existing repository into your new provider. + + As an example, if you're migrating from GitHub to Azure DevOps, you'll need to import your existing repository (GitHub) into your new git provider (Azure DevOps). 
For detailed steps on how to do this, refer to your git provider's documentation (Such as [GitHub](https://docs.github.com/en/migrations/importing-source-code/using-github-importer/importing-a-repository-with-github-importer), [GitLab](https://docs.gitlab.com/ee/user/project/import/repo_by_url.html), [Azure DevOps](https://learn.microsoft.com/en-us/azure/devops/repos/git/import-git-repository?view=azure-devops)) + +2. Go back to dbt Cloud and set up your [integration for the new git provider](/docs/cloud/git/connect-github), if needed. +3. Disconnect the old repository in dbt Cloud by going to **Account Settings** and then **Projects**. Click on the **Repository** link, then click **Edit** and **Disconnect**. + + + +4. On the same page, connect to the new git provider repository by clicking **Configure Repository** + - If you're using the native integration, you may need to OAuth to it. + +5. That's it, you should now be connected to the new git provider! 🎉 + +Note — As a tip, we recommend you refresh your page and dbt Cloud IDE before performing any actions. diff --git a/website/docs/faqs/Git/gitignore.md b/website/docs/faqs/Git/gitignore.md index fb097bb4043..6bda9611733 100644 --- a/website/docs/faqs/Git/gitignore.md +++ b/website/docs/faqs/Git/gitignore.md @@ -1,25 +1,233 @@ --- -title: Why can't I checkout a branch or create a new branch? -description: "Add or fill in gitignore file" -sidebar_label: 'Unable to checkout or create branch' +title: How can I fix my .gitignore file? +description: "Use these instructions to fix your gitignore file" +sidebar_label: 'How to fix your .gitignore file' id: gitignore --- -If you're finding yourself unable to revert changes, check out a branch or click commit - this is usually do to your project missing a [.gitignore](https://github.com/dbt-labs/dbt-starter-project/blob/main/.gitignore) file OR your gitignore file doesn't contain the necessary content inside the folder. 
+A `.gitignore` file specifies which files git should intentionally ignore or 'untrack'. dbt Cloud indicates untracked files in the project file explorer pane by putting the file or folder name in *italics*. -This is what causes that 'commit' git action button to display. No worries though - to fix this, you'll need to complete the following steps in order: +If you encounter issues like problems reverting changes, checking out or creating a new branch, or not being prompted to open a pull request after a commit in the dbt Cloud IDE — this usually indicates a problem with the [.gitignore](https://github.com/dbt-labs/dbt-starter-project/blob/main/.gitignore) file. The file may be missing or lacks the required entries for dbt Cloud to work correctly. -1. In the Cloud IDE, add the missing .gitignore file or contents to your project. You'll want to make sure the .gitignore file includes the following: +### Fix in the dbt Cloud IDE - ```shell - target/ - dbt_modules/ - dbt_packages/ - logs/ - ``` +To resolve issues with your `gitignore` file, adding the correct entries won't automatically remove (or 'untrack') files or folders that have already been tracked by git. The updated `gitignore` will only prevent new files or folders from being tracked. So you'll need to first fix the `gitignore` file, then perform some additional git operations to untrack any incorrect files or folders. -2. Once you've added that, make sure to save and commit. + -3. Navigate to the same branch in your remote repository (which can be accessed directly through your git provider's web interface) and delete the logs, target, and dbt_modules/dbt_packages folders. +1. Launch the Cloud IDE into the project that is being fixed, by selecting **Develop** on the menu bar. +2. In your **File Explorer**, check to see if a `.gitignore` file exists at the root of your dbt project folder. If it doesn't exist, create a new file. +3. Open the new or existing `gitignore` file, and add the following: -4. 
Go back into the Cloud IDE and reclone your repository. This can be done by clicking on the green "ready" in the bottom right corner of the IDE (next to the command bar), and then clicking the orange "reclone repo" button in the pop up. +```bash +# ✅ Correct +target/ +dbt_packages/ +logs/ +# legacy -- renamed to dbt_packages in dbt v1 +dbt_modules/ +``` + +* **Note** — You can place these lines anywhere in the file, as long as they're on separate lines. The lines shown are wildcards that will include all nested files and folders. Avoid adding a trailing `'*'` to the lines, such as `target/*`. + +For more info on `gitignore` syntax, refer to the [Git docs](https://git-scm.com/docs/gitignore). + +4. Save the changes but _don't commit_. +5. Restart the IDE by clicking on the three dots next to the **IDE Status button** on the lower right corner of the IDE screen and select **Restart IDE**. + + + +6. Once the IDE restarts, go to the **File Explorer** to delete the following files or folders (if they exist). No data will be lost: + * `target`, `dbt_modules`, `dbt_packages`, `logs` +7. **Save** and then **Commit and sync** the changes. +8. Restart the IDE again using the same procedure as step 5. +9. Once the IDE restarts, use the **Create a pull request** (PR) button under the **Version Control** menu to start the process of integrating the changes. +10. When the git provider's website opens to a page with the new PR, follow the necessary steps to complete and merge the PR into the main branch of that repository. + + * **Note** — The 'main' branch might also be called 'master', 'dev', 'qa', 'prod', or something else depending on the organizational naming conventions. The goal is to merge these changes into the root branch that all other development branches are created from. + +11. Return to the dbt Cloud IDE and use the **Change Branch** button, to switch to the main branch of the project. +12. 
Once the branch has changed, click the **Pull from remote** button to pull in all the changes. +13. Verify the changes by making sure the files/folders in the `.gitignore` file are in italics. + + + +### Fix in the git provider + +Sometimes it's necessary to use the git provider's web interface to fix a broken `.gitignore` file. Although the specific steps may vary across providers, the general process remains the same. + +There are two options for this approach: editing the main branch directly if allowed, or creating a pull request to implement the changes if required: + + + + + +When permissions allow it, it's possible to edit the `.gitignore` directly on the main branch of your repo. Here are the following steps: + +1. Go to your repository's web interface. +2. Switch to the main branch and the root directory of your dbt project. +3. Find the `.gitignore` file. Create a blank one if it doesn't exist. +4. Edit the file in the web interface, adding the following entries: +```bash +target/ +dbt_packages/ +logs/ +# legacy -- renamed to dbt_packages in dbt v1 +dbt_modules/ +``` + +5. Commit (save) the file. +6. Delete the following folders from the dbt project root, if they exist. No data or code will be lost: + * `target`, `dbt_modules`, `dbt_packages`, `logs` +7. Commit (save) the deletions to the main branch. +8. Switch to the dbt Cloud IDE, and open the project that you're fixing. +9. Reclone your repo in the IDE by clicking on the three dots next to the **IDE Status** button on the lower right corner of the IDE screen, then select **Reclone Repo**. + * **Note** — Any saved but uncommitted changes will be lost, so make sure you copy any modified code that you want to keep in a temporary location outside of dbt Cloud. +10. Once you reclone the repo, open the `.gitignore` file in the branch you're working in. If the new changes aren't included, you'll need to merge the latest commits from the main branch into your working branch. +11. 
Go to the **File Explorer** to verify the `.gitignore` file contains the correct entries and make sure the untracked files/folders in the .gitignore file are in *italics*. +12. Great job 🎉! You've configured the `.gitignore` correctly and can continue with your development! + + + + + +If you can't edit the `.gitignore` directly on the main branch of your repo, follow these steps: + +1. Go to your repository's web interface. +2. Switch to an existing development branch, or create a new branch just for these changes (This is often faster and cleaner). +3. Find the `.gitignore` file. Create a blank one if it doesn't exist. +4. Edit the file in the web interface, adding the following entries: + +```bash +target/ +dbt_packages/ +logs/ +# legacy -- renamed to dbt_packages in dbt v1 +dbt_modules/ +``` +5. Commit (save) the file. +6. Delete the following folders from the dbt project root, if they exist. No data or code will be lost: + * `target`, `dbt_modules`, `dbt_packages`, `logs` +7. Commit (save) the deleted folders. +8. Open a merge request using the git provider web interface. The merge request should attempt to merge the changes into the 'main' branch that all development branches are created from. +9. Follow the necessary procedures to get the branch approved and merged into the 'main' branch. You can delete the branch after the merge is complete. +10. Once the merge is complete, go back to the dbt Cloud IDE, and open the project that you're fixing. +11. Reclone your repo in the IDE by clicking on the three dots next to the **IDE Status** button on the lower right corner of the IDE screen, then select **Reclone Repo**. + * **Note** — Any saved but uncommitted changes will be lost, so make sure you copy any modified code that you want to keep in a temporary location outside of dbt Cloud. +12. Once you reclone the repo, open the `.gitignore` file in the branch you're working in. 
If the new changes aren't included, you'll need to merge the latest commits from the main branch into your working branch. +13. Go to the **File Explorer** to verify the `.gitignore` file contains the correct entries and make sure the untracked files/folders in the .gitignore file are in *italics*. +14. Great job 🎉! You've configured the `.gitignore` correctly and can continue with your development! + + + + + + + + +1. Launch the Cloud IDE into the project that is being fixed, by selecting **Develop** on the menu bar. +2. In your **File Explorer**, check to see if a `.gitignore` file exists at the root of your dbt project folder. If it doesn't exist, create a new file. +3. Open the new or existing `gitignore` file, and add the following: + +```bash +target/ +dbt_packages/ +logs/ +# legacy -- renamed to dbt_packages in dbt v1 +dbt_modules/ +``` + + * **Note** — You can place these lines anywhere in the file, as long as they're on separate lines. The lines shown are wildcards that will include all nested files and folders. Avoid adding a trailing `'*'` to the lines, such as `target/*`. + +For more info on `gitignore` syntax, refer to the [Git docs](https://git-scm.com/docs/gitignore). + +4. Save the changes but _don't commit_. +5. Restart the IDE by clicking on the three dots next to the **IDE Status button** on the lower right corner of the IDE screen and select **Restart IDE**. + + + +6. Once the IDE restarts, go to the **File Explorer** to delete the following files or folders (if they exist). No data will be lost: + * `target`, `dbt_modules`, `dbt_packages`, `logs` +7. **Save** and then **Commit and sync** the changes. +8. Restart the IDE again using the same procedure as step 5. +9. Once the IDE restarts, use the 'Create a pull request' (PR) button under the **Version Control** menu to start the process of integrating the changes. +10. 
When the git provider's website opens to a page with the new PR, follow the necessary steps to complete and merge the PR into the main branch of that repository. + + * **Note** — The 'main' branch might also be called 'master', 'dev', 'qa', 'prod', or something else depending on the organizational naming conventions. The goal is to merge these changes into the root branch that all other development branches are created from. + +11. Return to the dbt Cloud IDE and use the **Change Branch** button to switch to the main branch of the project. +12. Once the branch has changed, click the **Pull from remote** button to pull in all the changes. +13. Verify the changes by making sure the files/folders in the `.gitignore` file are in italics. + + + + +### Fix in the git provider + +Sometimes it's necessary to use the git provider's web interface to fix a broken `.gitignore` file. Although the specific steps may vary across providers, the general process remains the same. + +There are two options for this approach: editing the main branch directly if allowed, or creating a pull request to implement the changes if required: + + + + + +When permissions allow it, it's possible to edit the `.gitignore` directly on the main branch of your repo. Here are the following steps: + +1. Go to your repository's web interface. +2. Switch to the main branch, and the root directory of your dbt project. +3. Find the `.gitignore` file. Create a blank one if it doesn't exist. +4. Edit the file in the web interface, adding the following entries: +```bash +target/ +dbt_packages/ +logs/ +# legacy -- renamed to dbt_packages in dbt v1 +dbt_modules/ +``` +5. Commit (save) the file. +6. Delete the following folders from the dbt project root, if they exist. No data or code will be lost: + * `target`, `dbt_modules`, `dbt_packages`, `logs` +7. Commit (save) the deletions to the main branch. +8. Switch to the dbt Cloud IDE, and open the project that you're fixing. +9. 
Reclone your repo in the IDE by clicking on the three dots next to the **IDE Status** button on the lower right corner of the IDE screen, then select **Reclone Repo**. + * **Note** — Any saved but uncommitted changes will be lost, so make sure you copy any modified code that you want to keep in a temporary location outside of dbt Cloud. +10. Once you reclone the repo, open the `.gitignore` file in the branch you're working in. If the new changes aren't included, you'll need to merge the latest commits from the main branch into your working branch. +11. Go to the **File Explorer** to verify the `.gitignore` file contains the correct entries and make sure the untracked files/folders in the .gitignore file are in *italics*. +12. Great job 🎉! You've configured the `.gitignore` correctly and can continue with your development! + + + + +If you can't edit the `.gitignore` directly on the main branch of your repo, follow these steps: + +1. Go to your repository's web interface. +2. Switch to an existing development branch, or create a new branch just for these changes (This is often faster and cleaner). +3. Find the `.gitignore` file. Create a blank one if it doesn't exist. +4. Edit the file in the web interface, adding the following entries: +```bash +target/ +dbt_packages/ +logs/ +# legacy -- renamed to dbt_packages in dbt v1 +dbt_modules/ +``` +5. Commit (save) the file. +6. Delete the following folders from the dbt project root, if they exist. No data or code will be lost: + * `target`, `dbt_modules`, `dbt_packages`, `logs` +7. Commit (save) the deleted folders. +8. Open a merge request using the git provider web interface. The merge request should be attempting to merge the changes into the 'main' branch that all development branches are created from. +9. Follow the necessary procedures to get the branch approved and merged into the 'main' branch. You can delete the branch after the merge is complete. +10. 
Once the merge is complete, go back to the dbt Cloud IDE, and open the project that you're fixing. +11. Reclone your repo in the IDE by clicking on the three dots next to the **IDE Status** button on the lower right corner of the IDE screen, then select **Reclone Repo**. + * **Note** — Any saved but uncommitted changes will be lost, so make sure you copy any modified code that you want to keep in a temporary location outside of dbt Cloud. +12. Once you reclone the repo, open the `.gitignore` file in the branch you're working in. If the new changes aren't included, you'll need to merge the latest commits from the main branch into your working branch. +13. Go to the **File Explorer** to verify the `.gitignore` file contains the correct entries and make sure the untracked files/folders in the .gitignore file are in *italics*. +14. Great job 🎉! You've configured the `.gitignore` correctly and can continue with your development! + + + + + + +For more info, refer to this [detailed video](https://www.loom.com/share/9b3b8e2b617f41a8bad76ec7e42dd014) for additional guidance. diff --git a/website/docs/faqs/Git/managed-repo.md b/website/docs/faqs/Git/managed-repo.md index c68bd5ab896..17b75256fb6 100644 --- a/website/docs/faqs/Git/managed-repo.md +++ b/website/docs/faqs/Git/managed-repo.md @@ -5,6 +5,6 @@ sidebar_label: 'How to request your managed repository zipfile' id: managed-repo --- -dbt Labs can send your managed repository through a ZIP file in its current state for you to push up to a git provider. After that, you'd just need to switch over to the [repo in your project](/docs/collaborate/git/import-a-project-by-git-url) to point to the new repository. +dbt Labs can send your managed repository through a ZIP file in its current state for you to push up to a git provider. After that, you'd just need to switch over to the [repo in your project](/docs/cloud/git/import-a-project-by-git-url) to point to the new repository. 
When you're ready to do this, [contact the dbt Labs Support team](mailto:support@getdbt.com) with your request and your managed repo URL, which you can find by navigating to your project setting. To find project settings, click the gear icon in the upper right, select **Account settings**, click **Projects**, and then select your project. Under **Repository** in the project details page, you can find your managed repo URL. diff --git a/website/docs/faqs/Jinja/jinja-whitespace.md b/website/docs/faqs/Jinja/jinja-whitespace.md index 90a7dc5d2e1..49ced7183b7 100644 --- a/website/docs/faqs/Jinja/jinja-whitespace.md +++ b/website/docs/faqs/Jinja/jinja-whitespace.md @@ -7,6 +7,6 @@ id: jinja-whitespace This is known as "whitespace control". -Use a minus sign (`-`, e.g. `{{- ... -}}`, `{%- ... %}`, `{#- ... -#}`) at the start or end of a block to strip whitespace before or after the block (more docs [here](https://jinja.palletsprojects.com/page/templates/#whitespace-control)). Check out the [tutorial on using Jinja](using-jinja#use-whitespace-control-to-tidy-up-compiled-code) for an example. +Use a minus sign (`-`, e.g. `{{- ... -}}`, `{%- ... %}`, `{#- ... -#}`) at the start or end of a block to strip whitespace before or after the block (more docs [here](https://jinja.palletsprojects.com/page/templates/#whitespace-control)). Check out the [tutorial on using Jinja](/guides/advanced/using-jinja#use-whitespace-control-to-tidy-up-compiled-code) for an example. Take caution: it's easy to fall down a rabbit hole when it comes to whitespace control! 
diff --git a/website/docs/faqs/Jinja/quoting-column-names.md b/website/docs/faqs/Jinja/quoting-column-names.md index aff4a36b886..e0cdeabc629 100644 --- a/website/docs/faqs/Jinja/quoting-column-names.md +++ b/website/docs/faqs/Jinja/quoting-column-names.md @@ -5,7 +5,7 @@ sidebar_label: 'Why quote column names in Jinja' id: quoting-column-names --- -In the [macro example](jinja-macros#macros) we passed the column name `amount` quotes: +In the [macro example](/docs/build/jinja-macros#macros) we passed the column name `amount` in quotes: ```sql {{ cents_to_dollars('amount') }} as amount_usd @@ -19,4 +19,4 @@ Quoting in Jinja can take a while to get used to! The rule is that you're within Single and double quotes are equivalent in Jinja – just make sure you match them appropriately. -And if you do need to pass a variable as an argument, make sure you [don't nest your curlies](dont-nest-your-curlies) +And if you do need to pass a variable as an argument, make sure you [don't nest your curlies](/docs/building-a-dbt-project/dont-nest-your-curlies). diff --git a/website/docs/faqs/Models/available-configurations.md b/website/docs/faqs/Models/available-configurations.md index 5d59dfd4246..973bba06b7c 100644 --- a/website/docs/faqs/Models/available-configurations.md +++ b/website/docs/faqs/Models/available-configurations.md @@ -6,10 +6,10 @@ id: available-configurations --- You can also configure: -* [tags](resource-configs/tags) to support easy categorization and graph selection -* [custom schemas](resource-configs/schema) to split your models across multiple schemas -* [aliases](resource-configs/alias) if your / name should differ from the filename -* Snippets of SQL to run at the start or end of a model, known as [hooks](hooks-operations) +* [tags](/reference/resource-configs/tags) to support easy categorization and graph selection +* [custom schemas](/reference/resource-properties/schema) to split your models across multiple schemas +* 
[aliases](/reference/resource-configs/alias) if your / name should differ from the filename +* Snippets of SQL to run at the start or end of a model, known as [hooks](/docs/build/hooks-operations) * Warehouse-specific configurations for performance (e.g. `sort` and `dist` keys on Redshift, `partitions` on BigQuery) -Check out the docs on [model configurations](model-configs) to learn more. +Check out the docs on [model configurations](/reference/model-configs) to learn more. diff --git a/website/docs/faqs/Models/available-materializations.md b/website/docs/faqs/Models/available-materializations.md index f8b1530aca4..011d3ba3fb0 100644 --- a/website/docs/faqs/Models/available-materializations.md +++ b/website/docs/faqs/Models/available-materializations.md @@ -5,6 +5,7 @@ sidebar_label: 'Materializations available' id: available-materializations --- -dbt ships with four materializations: `view`, `table`, `incremental` and `ephemeral`. Check out the documentation on [materializations](materializations) for more information on each of these options. +dbt ships with five materializations: `view`, `table`, `incremental`, `ephemeral` and `materialized_view`. +Check out the documentation on [materializations](/docs/build/materializations) for more information on each of these options. -You can also create your own [custom materializations](creating-new-materializations), if required however this is an advanced feature of dbt. +You can also create your own [custom materializations](/guides/advanced/creating-new-materializations), if required. However, this is an advanced feature of dbt. 
diff --git a/website/docs/faqs/Models/configurable-model-path.md b/website/docs/faqs/Models/configurable-model-path.md index 9d16bb1f05f..c34112a5fe1 100644 --- a/website/docs/faqs/Models/configurable-model-path.md +++ b/website/docs/faqs/Models/configurable-model-path.md @@ -6,13 +6,7 @@ id: configurable-model-path --- - - -- **v1.0.0:** The config 'source-path' has been deprecated in favor of [`model-paths`](model-paths). - - - -By default, dbt expects your seed files to be located in the `models` subdirectory of your project. +By default, dbt expects the files defining your models to be located in the `models` subdirectory of your project. To change this, update the [model-paths](reference/project-configs/model-paths.md) configuration in your `dbt_project.yml` file, like so: diff --git a/website/docs/faqs/Models/create-dependencies.md b/website/docs/faqs/Models/create-dependencies.md index 4a84e3ab7d7..6a01aa18dca 100644 --- a/website/docs/faqs/Models/create-dependencies.md +++ b/website/docs/faqs/Models/create-dependencies.md @@ -6,7 +6,7 @@ id: create-dependencies --- -When you use the `ref` [function](dbt-jinja-functions/ref), dbt automatically infers the dependencies between models. +When you use the `ref` [function](/reference/dbt-jinja-functions/ref), dbt automatically infers the dependencies between models. For example, consider a model, `customer_orders`, like so: @@ -44,4 +44,4 @@ Found 2 models, 28 tests, 0 snapshots, 0 analyses, 130 macros, 0 operations, 0 s Done. PASS=2 WARN=0 ERROR=0 SKIP=0 TOTAL=2 ``` -To learn more about building a dbt project, we recommend you complete the [Getting Started guide](/docs/get-started/getting-started/overview). +To learn more about building a dbt project, we recommend you complete the [quickstart guide](/quickstarts). 
diff --git a/website/docs/faqs/Models/reference-models-in-another-project.md b/website/docs/faqs/Models/reference-models-in-another-project.md deleted file mode 100644 index 0a06b04e7bd..00000000000 --- a/website/docs/faqs/Models/reference-models-in-another-project.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: How can I reference models or macros in another project? -description: "Use packages to add another project to your dbt project" -sidebar_label: 'Reference models or macros in another project' -id: reference-models-in-another-project - ---- - -You can use [packages]/docs/build/packages) to add another project to your dbt -project, including other projects you've created. Check out the [docs](/docs/build/packages) -for more information! diff --git a/website/docs/faqs/Models/run-downtime.md b/website/docs/faqs/Models/run-downtime.md index 4c19511061b..1de93b8d263 100644 --- a/website/docs/faqs/Models/run-downtime.md +++ b/website/docs/faqs/Models/run-downtime.md @@ -7,4 +7,4 @@ id: run-downtime --- Nope! The SQL that dbt generates behind the scenes ensures that any relations are replaced atomically (i.e. your business users won't experience any downtime). -The implementation of this varies on each warehouse, check out the [logs](/docs/faqs/Runs/checking-logs) to see the SQL dbt is executing. +The implementation of this varies on each warehouse, check out the [logs](/faqs/Runs/checking-logs) to see the SQL dbt is executing. diff --git a/website/docs/faqs/Models/source-quotes.md b/website/docs/faqs/Models/source-quotes.md index da5a278669b..8905b6cf684 100644 --- a/website/docs/faqs/Models/source-quotes.md +++ b/website/docs/faqs/Models/source-quotes.md @@ -10,7 +10,7 @@ This is reasonably common on Snowflake in particular. By default, dbt will not quote the database, schema, or identifier for the source tables that you've specified. 
-To force dbt to quote one of these values, use the [`quoting` property](resource-properties/quoting): +To force dbt to quote one of these values, use the [`quoting` property](/reference/resource-properties/quoting): diff --git a/website/docs/faqs/Models/specifying-column-types.md b/website/docs/faqs/Models/specifying-column-types.md index 1adaf30fdce..8e8379c4ec1 100644 --- a/website/docs/faqs/Models/specifying-column-types.md +++ b/website/docs/faqs/Models/specifying-column-types.md @@ -40,4 +40,4 @@ To define additional column options: * Rather than enforcing uniqueness and not-null constraints on your column, use dbt's [testing](/docs/build/tests) functionality to check that your assertions about your model hold true. * Rather than creating default values for a column, use SQL to express defaults (e.g. `coalesce(updated_at, current_timestamp()) as updated_at`) -* In edge-cases where you _do_ need to alter a column (e.g. column-level encoding on Redshift), consider implementing this via a [post-hook](pre-hook-post-hook). +* In edge-cases where you _do_ need to alter a column (e.g. column-level encoding on Redshift), consider implementing this via a [post-hook](/reference/resource-configs/pre-hook-post-hook). diff --git a/website/docs/faqs/Models/sql-dialect.md b/website/docs/faqs/Models/sql-dialect.md index df7168a1b60..500e8cf2091 100644 --- a/website/docs/faqs/Models/sql-dialect.md +++ b/website/docs/faqs/Models/sql-dialect.md @@ -9,7 +9,7 @@ dbt can feel like magic, but it isn't actually magic. Under the hood, it's runni As such, your models should just use the **SQL dialect of your own database**. Then, when dbt wraps your `select` statements in the appropriate or , it will use the correct DML for your warehouse — all of this logic is written in to dbt. -You can find more information about the databases, platforms, and query engines that dbt supports in the [Supported Data Platforms](supported-data-platforms) docs. 
+You can find more information about the databases, platforms, and query engines that dbt supports in the [Supported Data Platforms](/docs/supported-data-platforms) docs. Want to go a little deeper on how this works? Consider a snippet of SQL that works on each warehouse: diff --git a/website/docs/faqs/Models/unique-model-names.md b/website/docs/faqs/Models/unique-model-names.md index b1a523427c0..c721fca7c6e 100644 --- a/website/docs/faqs/Models/unique-model-names.md +++ b/website/docs/faqs/Models/unique-model-names.md @@ -6,6 +6,20 @@ id: unique-model-names --- -Yes! To build dependencies between models, you need to use the `ref` function. The `ref` function only takes one argument — the model name (i.e. the filename). As a result, these model names need to be unique, _even if they are in distinct folders_. + + +Within one project: yes! To build dependencies between models, you need to use the `ref` function, and pass in the model name as an argument. dbt uses that model name to uniquely resolve the `ref` to a specific model. As a result, these model names need to be unique, _even if they are in distinct folders_. + +A model in one project can have the same name as a model in another project (installed as a dependency). dbt uses the project name to uniquely identify each model. We call this "namespacing." If you `ref` a model with a duplicated name, it will resolve to the model within the same namespace (package or project), or raise an error because of an ambiguous reference. Use [two-argument `ref`](/reference/dbt-jinja-functions/ref#two-argument-variant) to disambiguate references by specifying the namespace. + +Those models will still need to land in distinct locations in the data warehouse. Read the docs on [custom aliases](/docs/build/custom-aliases) and [custom schemas](/docs/build/custom-schemas) for details on how to achieve this. + + + + + +Yes! 
To build dependencies between models, you need to use the `ref` function, and pass in the model name as an argument. dbt uses that model name to uniquely resolve the `ref` to a specific model. As a result, these model names need to be unique, _even if they are in distinct folders_. Often, this question comes up because users want to give two models the same name in their warehouse, splitting them across separate schemas (e.g. `stripe.users` and `app.users`). Checkout the docs on [custom aliases](/docs/build/custom-aliases) and [custom schemas](/docs/build/custom-schemas) to achieve this. + + diff --git a/website/docs/docs/get-started/getting-started/add-a-seed.md b/website/docs/faqs/Project/add-a-seed.md similarity index 78% rename from website/docs/docs/get-started/getting-started/add-a-seed.md rename to website/docs/faqs/Project/add-a-seed.md index 85080109cd4..60cbcc9d91c 100644 --- a/website/docs/docs/get-started/getting-started/add-a-seed.md +++ b/website/docs/faqs/Project/add-a-seed.md @@ -3,10 +3,6 @@ title: Add a seed file id: add-a-seed description: Learn how to add a seed file to your project --- -:::caution Heads up! -You'll need to have completed the earlier parts of this guide to -complete this lesson -::: 1. Add a seed file: diff --git a/website/docs/faqs/Project/consolidate-projects.md b/website/docs/faqs/Project/consolidate-projects.md new file mode 100644 index 00000000000..01a4e24907c --- /dev/null +++ b/website/docs/faqs/Project/consolidate-projects.md @@ -0,0 +1,32 @@ +--- +title: How can I consolidate projects in dbt Cloud? +description: "Consolidating projects in dbt Cloud" +sidebar_label: 'How to consolidate projects' +id: consolidate-projects + +--- + +Consolidating your dbt projects can be an enormous task, and there is no universal solution. But, there are some common approaches to project consolidation in dbt Cloud that you can follow, depending on the scope of the work that needs to be done. 
+ +If you have multiple projects that contain production-worthy code, there are rarely straightforward solutions to merging them. Let's suppose you have `Main Project` and `Smaller Subset Project`. + +## Files and Folders + +### Git and the local directory + +Reference the [merge git commands](https://gist.github.com/msrose/2feacb303035d11d2d05) to help complete the migration plan. Using the commands will help retain git commit history, but you might end up with duplicate folders called `models`, `tests`, etc. You will most likely still have to move files around manually. + +Another option would be to use an external code editor (for example, VS Code) to move files from the `Smaller Subset Project` to the `Main Project`. This is what internal dbt Labs experts recommend: it keeps you informed about what comes over to the main project, makes you more aware of the incoming files, and lets you make any minor tweaks to the folder hierarchy at the same time. + +### Manual migration with multiple browser tabs + +If you only have a couple of models or macros that you want to consolidate, copy the raw file contents from your git provider in `Smaller Subset Project`. Then, in the dbt Cloud IDE, paste the contents into a new file in your `Main Project`. + +Alternatively, you can download those files from your git provider (`Smaller Subset Project` repo) and upload them back to your other repository (`Main Project` repo). This doesn’t scale well and could bypass change controls, so it might only be a viable solution for organizations with only a few files. + +## Production jobs +If you have multiple projects with deployment environments deploying jobs, this poses another challenge. Assuming all the models from `Smaller Subset Project` can be consolidated into `Main Project`, your commands within your jobs will take on a new meaning. 
In lieu of refactoring your global job strategy at the same time, you can add tags to the incoming project models and utilize that in your job command syntax, with the help of node selection syntax. + +Main Project job command example: `dbt build --exclude tag:smaller_subset_project` + +Smaller Subset Project commands: `dbt build --select tag:smaller_subset_project` diff --git a/website/docs/faqs/Project/dbt-source-freshness.md b/website/docs/faqs/Project/dbt-source-freshness.md index 496d5061eab..e2554579ffc 100644 --- a/website/docs/faqs/Project/dbt-source-freshness.md +++ b/website/docs/faqs/Project/dbt-source-freshness.md @@ -11,4 +11,4 @@ The `dbt source freshness` command will output a pass/warning/error status for e Additionally, dbt will write the freshness results to a file in the `target/` directory called `sources.json` by default. You can also override this destination, use the `-o` flag to the `dbt source freshness` command. -After enabling source freshness within a job, configure [Artifacts](docs/dbt-cloud/using-dbt-cloud/artifacts) in your **Project Details** page, which you can find by clicking the gear icon and then selecting **Account settings**. You can see the current status for source freshness by clicking **View Sources** in the job page. +After enabling source freshness within a job, configure [Artifacts](/docs/deploy/artifacts) in your **Project Details** page, which you can find by clicking the gear icon and then selecting **Account settings**. You can see the current status for source freshness by clicking **View Sources** in the job page. 
diff --git a/website/docs/faqs/Project/debugging-jinja.md b/website/docs/faqs/Project/debugging-jinja.md index bebc76b4f99..824229f98f3 100644 --- a/website/docs/faqs/Project/debugging-jinja.md +++ b/website/docs/faqs/Project/debugging-jinja.md @@ -8,4 +8,4 @@ id: debugging-jinja You should get familiar with checking the compiled SQL in `target/compiled//` and the logs in `logs/dbt.log` to see what dbt is running behind the scenes. -You can also use the [log](log) function to debug Jinja by printing objects to the command line. +You can also use the [log](/reference/dbt-jinja-functions/log) function to debug Jinja by printing objects to the command line. diff --git a/website/docs/faqs/Project/delete-a-project.md b/website/docs/faqs/Project/delete-a-project.md new file mode 100644 index 00000000000..5fde3fee9cd --- /dev/null +++ b/website/docs/faqs/Project/delete-a-project.md @@ -0,0 +1,18 @@ +--- +title: How do I delete a project in dbt Cloud? +description: "Deleting a project in dbt Cloud" +sidebar_label: 'How to delete a project' +id: delete-a-project + +--- +To delete a project in dbt Cloud, you must be the account owner or have admin privileges. + +1. From dbt Cloud, click the gear icon at the top right corner and select **Account Settings**. + + + +2. In **Account Settings**, select **Projects**. Click the project you want to delete from the **Projects** page. +3. Click the edit icon in the lower right-hand corner of the **Project Details**. A **Delete** option will appear on the left side of the same details view. +4. Select **Delete**. Confirm the action to immediately delete the project without additional password prompts. There will be no account password prompt, and the project is deleted immediately after confirmation. Once a project is deleted, this action cannot be undone. 
+ + diff --git a/website/docs/faqs/Project/example-projects.md b/website/docs/faqs/Project/example-projects.md index d4bebc7206d..f59d6e56e78 100644 --- a/website/docs/faqs/Project/example-projects.md +++ b/website/docs/faqs/Project/example-projects.md @@ -8,10 +8,12 @@ id: example-projects Yes! -* **Getting Started Tutorial:** You can build your own example dbt project in the [Getting Started guide](/docs/get-started/getting-started/overview) +* **Quickstart Tutorial:** You can build your own example dbt project in the [quickstart guide](/quickstarts) * **Jaffle Shop:** A demonstration project (closely related to the tutorial) for a fictional ecommerce store ([source code](https://github.com/dbt-labs/jaffle_shop)) * **MRR Playbook:** A demonstration project that models subscription revenue ([source code](https://github.com/dbt-labs/mrr-playbook), [docs](https://www.getdbt.com/mrr-playbook/#!/overview)) * **Attribution Playbook:** A demonstration project that models marketing attribution ([source code](https://github.com/dbt-labs/attribution-playbook), [docs](https://www.getdbt.com/attribution-playbook/#!/overview)) * **GitLab:** Gitlab's internal dbt project is open source and is a great example of how to use dbt at scale ([source code](https://gitlab.com/gitlab-data/analytics/-/tree/master/transform/snowflake-dbt), [docs](https://dbt.gitlabdata.com/)) +* **dummy-dbt:** A containerized dbt project that populates the Sakila database in Postgres and populates dbt seeds, models, snapshots, and tests. 
The project can be used for testing and experimentation purposes ([source code](https://github.com/gmyrianthous/dbt-dummy)) +* **Google Analytics 4:** A demonstration project that transforms the Google Analytics 4 BigQuery exports to various models ([source code](https://github.com/stacktonic-com/stacktonic-dbt-example-project), [docs](https://stacktonic.com/article/google-analytics-big-query-and-dbt-a-dbt-example-project)) If you have an example project to add to this list, suggest an edit by clicking **Edit this page** below. diff --git a/website/docs/faqs/Project/multiple-resource-yml-files.md b/website/docs/faqs/Project/multiple-resource-yml-files.md index 06738aeae65..422b7beb702 100644 --- a/website/docs/faqs/Project/multiple-resource-yml-files.md +++ b/website/docs/faqs/Project/multiple-resource-yml-files.md @@ -7,6 +7,6 @@ id: multiple-resource-yml-files --- It's up to you: - Some folks find it useful to have one file per model (or source / snapshot / seed etc) -- Some find is useful to have one per directory, documenting and testing multiple models in one file +- Some find it useful to have one per directory, documenting and testing multiple models in one file -Choose what works for your team. We have more recommendations in our guide on [structuring dbt project](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355). +Choose what works for your team. We have more recommendations in our guide on [structuring dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). 
diff --git a/website/docs/faqs/Project/properties-not-in-config.md b/website/docs/faqs/Project/properties-not-in-config.md index 9e334c55ce3..d1aea32b687 100644 --- a/website/docs/faqs/Project/properties-not-in-config.md +++ b/website/docs/faqs/Project/properties-not-in-config.md @@ -6,7 +6,7 @@ id: properties-not-in-config --- -In v0.21, dbt added the ability to define node configs in `.yml` files, in addition to `config()` blocks and `dbt_project.yml`. But the reverse isn't always true: there are some things in `.yml` files that can _only_ be defined there. +dbt has the ability to define node configs in `.yml` files, in addition to `config()` blocks and `dbt_project.yml`. But the reverse isn't always true: there are some things in `.yml` files that can _only_ be defined there. Certain properties are special, because: - They have a unique Jinja rendering context @@ -15,11 +15,11 @@ Certain properties are special, because: - They're older properties that haven't yet been redefined as configs These properties are: -- [`description`](resource-properties/description) -- [`tests`](resource-properties/tests) +- [`description`](/reference/resource-properties/description) +- [`tests`](/reference/resource-properties/tests) - [`docs`](/reference/resource-configs/docs) - `columns` -- [`quote`](resource-properties/quote) -- [`source` properties](source-properties) (e.g. `loaded_at_field`, `freshness`) -- [`exposure` properties](exposure-properties) (e.g. `type`, `maturity`) -- [`macro` properties](macro-properties) (e.g. `arguments`) +- [`quote`](/reference/resource-properties/quote) +- [`source` properties](/reference/source-properties) (e.g. `loaded_at_field`, `freshness`) +- [`exposure` properties](/reference/exposure-properties) (e.g. `type`, `maturity`) +- [`macro` properties](/reference/macro-properties) (e.g. 
`arguments`) diff --git a/website/docs/faqs/Project/schema-yml-name.md b/website/docs/faqs/Project/schema-yml-name.md index 9f8683647ca..cb1c120df14 100644 --- a/website/docs/faqs/Project/schema-yml-name.md +++ b/website/docs/faqs/Project/schema-yml-name.md @@ -9,6 +9,6 @@ No! You can name this file whatever you want (including `whatever_you_want.yml`) * The file is in your `models/` directory¹ * The file has `.yml` extension -Check out the [docs](configs-and-properties) for more information. +Check out the [docs](/reference/configs-and-properties) for more information. ¹If you're declaring properties for seeds, snapshots, or macros, you can also place this file in the related directory — `seeds/`, `snapshots/` and `macros/` respectively. diff --git a/website/docs/faqs/Project/source-has-bad-name.md b/website/docs/faqs/Project/source-has-bad-name.md index 10fb2f03d78..19a56fe6574 100644 --- a/website/docs/faqs/Project/source-has-bad-name.md +++ b/website/docs/faqs/Project/source-has-bad-name.md @@ -8,7 +8,7 @@ id: source-has-bad-name By default, dbt will use the `name:` parameters to construct the source reference. -If these names are a little less-than-perfect, use the [schema](resource-properties/schema) and [identifier](identifier) properties to define the names as per the database, and use your `name:` property for the name that makes sense! +If these names are a little less-than-perfect, use the [schema](/reference/resource-properties/schema) and [identifier](/reference/resource-properties/identifier) properties to define the names as per the database, and use your `name:` property for the name that makes sense! 
diff --git a/website/docs/faqs/Project/source-in-different-database.md b/website/docs/faqs/Project/source-in-different-database.md index cdfa3c62b76..26c262cc435 100644 --- a/website/docs/faqs/Project/source-in-different-database.md +++ b/website/docs/faqs/Project/source-in-different-database.md @@ -6,7 +6,7 @@ id: source-in-different-database --- -Use the [`database` property](resource-properties/database) to define the database that the source is in. +Use the [`database` property](/reference/resource-properties/database) to define the database that the source is in. diff --git a/website/docs/faqs/Project/structure-a-project.md b/website/docs/faqs/Project/structure-a-project.md index e627c5e4793..5d73f9f25ba 100644 --- a/website/docs/faqs/Project/structure-a-project.md +++ b/website/docs/faqs/Project/structure-a-project.md @@ -8,4 +8,4 @@ id: structure-a-project There's no one best way to structure a project! Every organization is unique. -If you're just getting started, check out how we (dbt Labs) [structure our dbt projects](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355). +If you're just getting started, check out how we (dbt Labs) [structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). diff --git a/website/docs/faqs/Project/which-materialization.md b/website/docs/faqs/Project/which-materialization.md index 32599a61746..5b1928fcacf 100644 --- a/website/docs/faqs/Project/which-materialization.md +++ b/website/docs/faqs/Project/which-materialization.md @@ -7,4 +7,4 @@ id: which-materialization --- Start out with views, and then change models to tables when required for performance reasons (i.e. downstream queries have slowed). -Check out the [docs on materializations](materializations) for advice on when to use each . +Check out the [docs on materializations](/docs/build/materializations) for advice on when to use each . 
diff --git a/website/docs/faqs/Project/why-not-write-dml.md b/website/docs/faqs/Project/why-not-write-dml.md index cf778c2086c..fd2cea7d3ad 100644 --- a/website/docs/faqs/Project/why-not-write-dml.md +++ b/website/docs/faqs/Project/why-not-write-dml.md @@ -28,6 +28,6 @@ You can test your models, generate documentation, create snapshots, and more! #### You reduce your vendor lock in -SQL dialects tend to diverge the most in DML and DDL (rather than in `select` statements) — check out the example [here](sql-dialect). By writing less SQL, it can make a migration to a new database technology easier. +SQL dialects tend to diverge the most in DML and DDL (rather than in `select` statements) — check out the example [here](/faqs/Models/sql-dialect). By writing less SQL, it can make a migration to a new database technology easier. -If you do need to write custom DML, there are ways to do this in dbt using [custom materializations](creating-new-materializations). +If you do need to write custom DML, there are ways to do this in dbt using [custom materializations](/guides/advanced/creating-new-materializations). diff --git a/website/docs/faqs/Project/why-version-2.md b/website/docs/faqs/Project/why-version-2.md index c86aab37262..dc7e409716e 100644 --- a/website/docs/faqs/Project/why-version-2.md +++ b/website/docs/faqs/Project/why-version-2.md @@ -6,6 +6,18 @@ id: why-version-2 --- + + +Once upon a time, the structure of these `.yml` files was very different (s/o to anyone who was using dbt back then!). Adding `version: 2` allowed us to make this structure more extensible. + +Resource yml files do not currently require this config. We only support `version: 2` if it's specified. Although we do not expect to update yml files to `version: 3` soon, having this config will make it easier for us to introduce new structures in the future. + + + + + Once upon a time, the structure of these `.yml` files was very different (s/o to anyone who was using dbt back then!). 
Adding `version: 2` allowed us to make this structure more extensible. Currently, Version 2 is the only supported version for these files. We kept `version:` around as a required key so that in the future, if we need to introduce a new structure for these files, we'll be able to do this more easily. + + diff --git a/website/docs/faqs/Project/yaml-file-extension.md b/website/docs/faqs/Project/yaml-file-extension.md index 6ba113db52c..accd16572ec 100644 --- a/website/docs/faqs/Project/yaml-file-extension.md +++ b/website/docs/faqs/Project/yaml-file-extension.md @@ -1,5 +1,5 @@ --- -title: Can I use a yaml file extension? +title: Can I use a YAML file extension? description: "dbt will only search for files with a `.yml` file extension" sidebar_label: '.yml file extension search' id: yaml-file-extension diff --git a/website/docs/faqs/Runs/run-downstream-of-seed.md b/website/docs/faqs/Runs/run-downstream-of-seed.md index bbf295f403d..cd8f45823ae 100644 --- a/website/docs/faqs/Runs/run-downstream-of-seed.md +++ b/website/docs/faqs/Runs/run-downstream-of-seed.md @@ -6,7 +6,7 @@ id: run-downstream-of-seed --- -You can run models downstream of a seed using the [model selection syntax](node-selection/syntax), and treating the seed like a model. +You can run models downstream of a seed using the [model selection syntax](/reference/node-selection/syntax), and treating the seed like a model. For example, the following would run all models downstream of a seed named `country_codes`: diff --git a/website/docs/faqs/Runs/run-one-model.md b/website/docs/faqs/Runs/run-one-model.md index dfbb54f2087..58fd2315cca 100644 --- a/website/docs/faqs/Runs/run-one-model.md +++ b/website/docs/faqs/Runs/run-one-model.md @@ -12,4 +12,4 @@ To run one model, use the `--select` flag (or `-s` flag), followed by the name o $ dbt run --select customers ``` -Check out the [model selection syntax documentation](node-selection/syntax) for more operators and examples. 
+Check out the [model selection syntax documentation](/reference/node-selection/syntax) for more operators and examples. diff --git a/website/docs/faqs/Runs/run-one-snapshot.md b/website/docs/faqs/Runs/run-one-snapshot.md index 7029f3c02da..5ed1c7c6912 100644 --- a/website/docs/faqs/Runs/run-one-snapshot.md +++ b/website/docs/faqs/Runs/run-one-snapshot.md @@ -12,4 +12,4 @@ To run one snapshot, use the `--select` flag, followed by the name of the snapsh $ dbt snapshot --select order_snapshot ``` -Check out the [model selection syntax documentation](node-selection/syntax) for more operators and examples. +Check out the [model selection syntax documentation](/reference/node-selection/syntax) for more operators and examples. diff --git a/website/docs/faqs/Runs/running-models-downstream-of-source.md b/website/docs/faqs/Runs/running-models-downstream-of-source.md index 9a56dd81294..ba39ab3d8ba 100644 --- a/website/docs/faqs/Runs/running-models-downstream-of-source.md +++ b/website/docs/faqs/Runs/running-models-downstream-of-source.md @@ -18,4 +18,4 @@ To run models downstream of one source : $ dbt run --select source:jaffle_shop.orders+ ``` -Check out the [model selection syntax](node-selection/syntax) for more examples! +Check out the [model selection syntax](/reference/node-selection/syntax) for more examples! diff --git a/website/docs/faqs/Seeds/build-one-seed.md b/website/docs/faqs/Seeds/build-one-seed.md index 974258cbe19..a971c67c529 100644 --- a/website/docs/faqs/Seeds/build-one-seed.md +++ b/website/docs/faqs/Seeds/build-one-seed.md @@ -15,6 +15,6 @@ $ dbt seed --select country_codes There is also an `--exclude` option. -Check out more in the [model selection syntax](node-selection/syntax) documentation. +Check out more in the [model selection syntax](/reference/node-selection/syntax) documentation. Prior to v0.16.0, there was no way to build one seed at a time. 
diff --git a/website/docs/faqs/Seeds/seed-custom-schemas.md b/website/docs/faqs/Seeds/seed-custom-schemas.md index 66b17807904..b03936e5735 100644 --- a/website/docs/faqs/Seeds/seed-custom-schemas.md +++ b/website/docs/faqs/Seeds/seed-custom-schemas.md @@ -17,9 +17,9 @@ name: jaffle_shop seeds: jaffle_shop: - schema: mappings # all seeds in this project will use the mapping schema by default + schema: mappings # all seeds in this project will use the schema "mappings" by default marketing: - schema: marketing # seeds in the `seeds/mapping/ subdirectory will use the marketing schema + schema: marketing # seeds in the "seeds/marketing/" subdirectory will use the schema "marketing" ``` diff --git a/website/docs/faqs/Snapshots/snapshot-hooks.md b/website/docs/faqs/Snapshots/snapshot-hooks.md index f60bd48e4c0..7b09a869a0e 100644 --- a/website/docs/faqs/Snapshots/snapshot-hooks.md +++ b/website/docs/faqs/Snapshots/snapshot-hooks.md @@ -7,8 +7,7 @@ id: snapshot-hooks --- Yes! The following hooks are available for snapshots: - - [pre-hooks](/reference/resource-configs/pre-hook-post-hook/) - [post-hooks](/reference/resource-configs/pre-hook-post-hook/) -- [on-run-start](/reference/resource-configs/pre-hook-post-hook/) -- [on-run-end](/reference/resource-configs/pre-hook-post-hook/) +- [on-run-start](/reference/project-configs/on-run-start-on-run-end/) +- [on-run-end](/reference/project-configs/on-run-start-on-run-end/) diff --git a/website/docs/faqs/Snapshots/snapshot-target-is-not-a-snapshot-table.md b/website/docs/faqs/Snapshots/snapshot-target-is-not-a-snapshot-table.md new file mode 100644 index 00000000000..5ce8f380008 --- /dev/null +++ b/website/docs/faqs/Snapshots/snapshot-target-is-not-a-snapshot-table.md @@ -0,0 +1,29 @@ +--- +title: "Debug Snapshot target is not a snapshot table errors" +description: "Debugging Snapshot target is not a snapshot table" +sidebar_label: "Snapshot target is not a snapshot table" +id: snapshot-target-is-not-a-snapshot-table +--- + +If 
you see the following error when you try executing the snapshot command: + +> Snapshot target is not a snapshot table (missing `dbt_scd_id`, `dbt_valid_from`, `dbt_valid_to`) + +Double check that you haven't inadvertently caused your snapshot to behave like table materializations by setting its `materialized` config to be `table`. Prior to dbt version 1.4, it was possible to have a snapshot like this: + +```sql +{% snapshot snappy %} + {{ config(materialized = 'table', ...) }} + ... +{% endsnapshot %} +``` + +dbt is treating snapshots like tables (issuing `create or replace table ...` statements) **silently** instead of actually snapshotting data (SCD2 via `insert` / `merge` statements). When upgrading to dbt versions 1.4 and higher, dbt now raises a Parsing Error (instead of silently treating snapshots like tables) that reads: + +``` +A snapshot must have a materialized value of 'snapshot' +``` + +This tells you to change your `materialized` config to `snapshot`. But when you make that change, you might encounter an error message saying that certain fields like `dbt_scd_id` are missing. This error happens because, previously, when dbt treated snapshots as tables, it didn't include the necessary [snapshot meta-fields](/docs/build/snapshots#snapshot-meta-fields) in your target table. Since those meta-fields don't exist, dbt correctly identifies that you're trying to create a snapshot in a table that isn't actually a snapshot. + +When this happens, you have to start from scratch — re-snapshotting your source data as if it was the first time by dropping your "snapshot" which isn't a real snapshot table. Then dbt snapshot will create a new snapshot and insert the snapshot meta-fields as expected. 
diff --git a/website/docs/faqs/Snapshots/snapshotting-freshness-for-one-source.md b/website/docs/faqs/Snapshots/snapshotting-freshness-for-one-source.md index c1cc7687b11..595dbab0f11 100644 --- a/website/docs/faqs/Snapshots/snapshotting-freshness-for-one-source.md +++ b/website/docs/faqs/Snapshots/snapshotting-freshness-for-one-source.md @@ -6,7 +6,6 @@ id: snapshotting-freshness-for-one-source --- -:::info As of dbt Core v0.21, you need to prefix sources with the source: selection method. In previous versions of dbt, sources were specified by name only. ::: Use the `--select` flag to snapshot freshness for specific sources. Eg: @@ -21,4 +20,4 @@ $ dbt source freshness --select source:jaffle_shop.orders $ dbt source freshness --select source:jaffle_shop.orders source:jaffle_shop.customers ``` -See the [`source freshness` command reference](commands/source) for more information. +See the [`source freshness` command reference](/reference/commands/source) for more information. diff --git a/website/docs/faqs/Tests/configurable-data-path.md b/website/docs/faqs/Tests/configurable-data-path.md index bacc732433b..7663d2d3f11 100644 --- a/website/docs/faqs/Tests/configurable-data-path.md +++ b/website/docs/faqs/Tests/configurable-data-path.md @@ -6,12 +6,6 @@ id: configurable-data-path --- - - -- **v1.0.0:** The config 'data-paths' has been deprecated in favor of [`seed-paths`](seed-paths). - - - By default, dbt expects your seed files to be located in the `seeds` subdirectory of your project. 
diff --git a/website/docs/faqs/Tests/custom-test-thresholds.md b/website/docs/faqs/Tests/custom-test-thresholds.md index 14dd36666c0..7155b39d25e 100644 --- a/website/docs/faqs/Tests/custom-test-thresholds.md +++ b/website/docs/faqs/Tests/custom-test-thresholds.md @@ -10,5 +10,5 @@ As of `v0.20.0`, you can use the `error_if` and `warn_if` configs to set custom For dbt `v0.19.0` and earlier, you could try these possible solutions: -* Setting the [severity](resource-properties/tests#severity) to `warn`, or: -* Writing a [custom generic test](custom-generic-tests) that accepts a threshold argument ([example](https://discourse.getdbt.com/t/creating-an-error-threshold-for-schema-tests/966)) +* Setting the [severity](/reference/resource-properties/tests#severity) to `warn`, or: +* Writing a [custom generic test](/guides/best-practices/writing-custom-generic-tests) that accepts a threshold argument ([example](https://discourse.getdbt.com/t/creating-an-error-threshold-for-schema-tests/966)) diff --git a/website/docs/faqs/Tests/test-one-model.md b/website/docs/faqs/Tests/test-one-model.md index 16f80e6934e..9774c732821 100644 --- a/website/docs/faqs/Tests/test-one-model.md +++ b/website/docs/faqs/Tests/test-one-model.md @@ -12,4 +12,4 @@ Running tests on one model looks very similar to running a model: use the `--sel dbt test --select customers ``` -Check out the [model selection syntax documentation](node-selection/syntax) for full syntax, and [test selection examples](test-selection-examples) in particular. +Check out the [model selection syntax documentation](/reference/node-selection/syntax) for full syntax, and [test selection examples](/reference/node-selection/test-selection-examples) in particular. 
diff --git a/website/docs/faqs/Tests/testing-seeds.md b/website/docs/faqs/Tests/testing-seeds.md index 75c7fcf0ff1..3b1b3e0df56 100644 --- a/website/docs/faqs/Tests/testing-seeds.md +++ b/website/docs/faqs/Tests/testing-seeds.md @@ -6,9 +6,7 @@ id: testing-seeds --- -The `seeds:` key is new in 0.16.0. Prior to this, use a `models:` key instead. - -To test and document seeds, use a [schema file](configs-and-properties) and nest the configurations under a `seeds:` key +To test and document seeds, use a [schema file](/reference/configs-and-properties) and nest the configurations under a `seeds:` key ## Example diff --git a/website/docs/faqs/Tests/testing-sources.md b/website/docs/faqs/Tests/testing-sources.md index 06051eae7e9..8eb769026e5 100644 --- a/website/docs/faqs/Tests/testing-sources.md +++ b/website/docs/faqs/Tests/testing-sources.md @@ -26,12 +26,3 @@ And, to run tests on one source only: $ dbt test --select source:jaffle_shop.orders ``` -Yep, we know this syntax is a little less than ideal, so we're hoping to improve it in a future release. Check out the [model selection syntax](node-selection/syntax) for more examples! - - -:::info Node selection syntax -In dbt 0.21.0, the node selection syntax [was standardized](https://github.com/dbt-labs/dbt-core/pull/3791) to use `--select` everywhere. Before this, some commands like `dbt run` and `dbt test` used `--models` instead. - -Older versions still show an error because [argparse](https://docs.python.org/3/library/argparse.html#allow-abbrev) is expanding `--select` to `--selector`, which is a different flag. -To fix this issue, either upgrade to dbt 0.21.0 or higher, or use `--models` instead of `--select`. 
-::: diff --git a/website/docs/faqs/Troubleshooting/access-gdrive-credential.md b/website/docs/faqs/Troubleshooting/access-gdrive-credential.md index ca73c5c2631..64799291ee2 100644 --- a/website/docs/faqs/Troubleshooting/access-gdrive-credential.md +++ b/website/docs/faqs/Troubleshooting/access-gdrive-credential.md @@ -14,7 +14,7 @@ Access denied: BigQuery BigQuery: Permission denied while getting Drive credenti Usually this errors indicates that you haven't granted the BigQuery service account access to the specific Google Drive document. If you're seeing this error, try giving the service account (client email seen [here](https://docs.getdbt.com/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database#connecting-to-bigquery)) you are using for your BigQuery connection in dbt Cloud, permission to your Google Drive or Google Sheet. You'll want to do this directly in your Google Document and click the 'share' button and enter the client email there. -If you are experiencing this error when using oAuth, and you have verified your access to the Google Sheet, you may need to grant permissions for gcloud to access Google Drive: +If you are experiencing this error when using OAuth, and you have verified your access to the Google Sheet, you may need to grant permissions for gcloud to access Google Drive: ``` gcloud auth application-default login --scopes=openid,https://www.googleapis.com/auth/userinfo.email,https://www.googleapis.com/auth/cloud-platform,https://www.googleapis.com/auth/sqlservice.login,https://www.googleapis.com/auth/drive diff --git a/website/docs/faqs/Troubleshooting/gitignore.md b/website/docs/faqs/Troubleshooting/gitignore.md index 47c7500e662..59fd4e8c866 100644 --- a/website/docs/faqs/Troubleshooting/gitignore.md +++ b/website/docs/faqs/Troubleshooting/gitignore.md @@ -1,26 +1,86 @@ --- -title: Why can't I checkout a branch or create a new branch? 
-description: "Add or fill in gitignore file" -sidebar_label: 'Unable to checkout or create branch' +title: How can I fix my .gitignore file? +description: "Use these instructions to fix your gitignore file" +sidebar_label: 'How to fix your .gitignore file' id: gitignore --- -If you're finding yourself unable to revert changes, check out a branch or click commit - this is usually do to your project missing a .[gitignore](https://github.com/dbt-labs/dbt-starter-project/blob/main/.gitignore) file OR your gitignore file doesn't contain the necessary content inside the folder. +A gitignore file specifies which files Git should intentionally ignore. You can identify these files in your project by their italics formatting. -This is what causes that 'commit' git action button to display. No worries though - to fix this, you'll need to complete the following steps in order: +If you can't revert changes, check out a branch, or click commit — this is usually due to your project missing a [.gitignore](https://github.com/dbt-labs/dbt-starter-project/blob/main/.gitignore) file OR your gitignore file doesn't contain the necessary content inside the folder. -1. In the Cloud IDE, add the missing .gitignore file or contents to your project. You'll want to make sure the .gitignore file includes the following: +To fix this, complete the following steps: - ```shell - target/ - dbt_modules/ - dbt_packages/ - logs/ - ``` + -2. Once you've added that, make sure to save and commit. +1. In the dbt Cloud IDE, add the following [.gitignore contents](https://github.com/dbt-labs/dbt-starter-project/blob/main/.gitignore) in your dbt project `.gitignore` file: +```bash +target/ +dbt_packages/ +logs/ +# legacy -- renamed to dbt_packages in dbt v1 +dbt_modules/ +``` +2. Save your changes but _don't commit_ +3. Restart the IDE by clicking on the three dots next to the **IDE Status button** on the lower right of the IDE. -3.
Navigate to the same branch in your remote repository (which can be accessed directly through your git provider's web interface) and delete the logs, target, and dbt_modules/dbt_packages folders. + -4. Go back into the Cloud IDE and reclone your repository. This can be done by clicking on the green "ready" in the bottom right corner of the IDE (next to the command bar), and then clicking the orange "reclone repo" button in the pop up. +4. Select **Restart IDE**. +5. Go back to your dbt project and delete the following files or folders if you have them: + * `target`, `dbt_modules`, `dbt_packages`, `logs` +6. **Save** and then **Commit and sync** your changes. +7. Restart the IDE again. +8. Create a pull request (PR) under the **Version Control** menu to integrate your new changes. +9. Merge the PR on your git provider page. +10. Switch to your main branch and click on **Pull from remote** to pull in all the changes you made to your main branch. You can verify the changes by making sure the files/folders in the .gitignore file are in italics. + + + + + + + +1. In the dbt Cloud IDE, add the following [.gitignore contents](https://github.com/dbt-labs/dbt-starter-project/blob/main/.gitignore) in your dbt project `.gitignore` file: +```bash +target/ +dbt_packages/ +logs/ +# legacy -- renamed to dbt_packages in dbt v1 +dbt_modules/ +``` +2. Go to your `dbt_project.yml` file and add `tmp/` after your `target-path:` and add `log-path: "tmp/logs"`. + * So it should look like: `target-path: "tmp/target"` and `log-path: "tmp/logs"`: + + + +3. Save your changes but _don't commit_. +4. Restart the IDE by clicking on the three dots next to the **IDE Status button** on the lower right of the IDE. + + + +5. Select **Restart IDE**. +6. Go back to your dbt project and delete the following four folders (if you have them): + * `target` + * `dbt_modules` + * `dbt_packages` + * `logs` +7. **Save** and then **Commit and sync** your changes. +8. 
Go back to your `dbt_project.yml` file and undo the modifications you made in **Step 2**. + + * Remove `tmp` from your `target-path` and completely remove the `log-path: "tmp/logs"` line. + + + +9. Restart the IDE again. +10. Delete the `tmp` folder in the **File Explorer**. +11. Create a pull request (PR) under the **Version Control** menu to integrate your new changes. +12. Merge the PR in your git provider page. +13. Switch to your main branch and click on **Pull from remote** to pull in all the changes you made to your main branch. You can verify the changes by making sure the files/folders in the .gitignore file are in italics. + + + + + +For additional guidance, refer to this [detailed video](https://www.loom.com/share/9b3b8e2b617f41a8bad76ec7e42dd014). diff --git a/website/docs/faqs/Warehouse/db-connection-dbt-compile.md b/website/docs/faqs/Warehouse/db-connection-dbt-compile.md new file mode 100644 index 00000000000..d8e58155b10 --- /dev/null +++ b/website/docs/faqs/Warehouse/db-connection-dbt-compile.md @@ -0,0 +1,33 @@ +--- +title: Why dbt compile needs a data platform connection +description: "`dbt compile` needs a data platform connection because the work it does depends on the current state of your warehouse" +sidebar_label: "Why dbt compile needs a data platform connection" +id: db-connection-dbt-compile +--- + +`dbt compile` needs a data platform connection in order to gather the info it needs (including from introspective queries) to prepare the SQL for every model in your project. + +### dbt compile + +The [`dbt compile` command](/reference/commands/compile) generates executable SQL from `source`, `model`, `test`, and `analysis` files. `dbt compile` is similar to `dbt run` except that it doesn't materialize the model's compiled SQL into an existing table.
So, up until the point of materialization, `dbt compile` and `dbt run` are similar because they both require a data platform connection, run queries, and have an [`execute` variable](/reference/dbt-jinja-functions/execute) set to `True`. + +However, here are some things to consider: + +- You don't need to execute `dbt compile` before `dbt run` +- In dbt, `compile` doesn't mean `parse`. This is because `parse` validates your written `YAML`, configured tags, and so on. + +### Introspective queries + +To generate the compiled SQL for many models, dbt needs to run introspective queries (which is when dbt needs to run SQL in order to pull data back and do something with it) against the data platform. + +These introspective queries include: + +- Populating the [relation cache](/guides/advanced/creating-new-materializations#update-the-relation-cache). Caching speeds up the metadata checks, including whether an [incremental model](/docs/build/incremental-models) already exists in the data platform. +- Resolving [macros](/docs/build/jinja-macros#macros), such as `run_query` or `dbt_utils.get_column_values` that you're using to template out your SQL. This is because dbt needs to run those queries during model SQL compilation. + +Without a data platform connection, dbt can't perform these introspective queries and won't be able to generate the compiled SQL needed for the next steps in the dbt workflow. You can [`parse`](/reference/commands/parse) a project and use the [`list`](/reference/commands/list) resources in the project, without an internet or data platform connection. Parsing a project is enough to produce a [manifest](/reference/artifacts/manifest-json), however, keep in mind that the written-out manifest won't include compiled SQL. + +To configure a project, you do need a [connection profile](/docs/core/connect-data-platform/connection-profiles) (`profiles.yml` if using the CLI). You need this file because the project's configuration depends on its contents.
For example, you may need to use [`{{target}}`](/reference/dbt-jinja-functions/target) for conditional configs or know what platform you're running against so that you can choose the right flavor of SQL. + + + diff --git a/website/docs/faqs/Warehouse/sample-profiles.md b/website/docs/faqs/Warehouse/sample-profiles.md index 57287a7d97b..130e32e706b 100644 --- a/website/docs/faqs/Warehouse/sample-profiles.md +++ b/website/docs/faqs/Warehouse/sample-profiles.md @@ -6,4 +6,4 @@ id: sample-profiles --- -The structure of a profile looks different on each warehouse. Check out the [Supported Data Platforms](supported-data-platforms) page, and navigate to the `Profile Setup` section for your warehouse. +The structure of a profile looks different on each warehouse. Check out the [Supported Data Platforms](/docs/supported-data-platforms) page, and navigate to the `Profile Setup` section for your warehouse. diff --git a/website/docs/guides/legacy/creating-new-materializations.md b/website/docs/guides/advanced/creating-new-materializations.md similarity index 87% rename from website/docs/guides/legacy/creating-new-materializations.md rename to website/docs/guides/advanced/creating-new-materializations.md index 05cf790a297..d3081ea8e20 100644 --- a/website/docs/guides/legacy/creating-new-materializations.md +++ b/website/docs/guides/advanced/creating-new-materializations.md @@ -1,11 +1,14 @@ --- title: "Creating new materializations" id: "creating-new-materializations" +description: Learn how to create your own materializations. +displayText: Creating new materializations +hoverSnippet: Learn how to create your own materializations. --- ## Overview -The model materializations you're familiar with, `table`, `view`, and `incremental` are implemented as macros in a package that's distributed along with dbt. 
You can check out the source for these materializations [here](https://github.com/dbt-labs/dbt-core/tree/dev/louisa-may-alcott/core/dbt/include/global_project/macros/materializations). If you need to create your own materializations, reading these files is a good place to start. Continue reading below for a deep-dive into dbt materializations. +The model materializations you're familiar with, `table`, `view`, and `incremental` are implemented as macros in a package that's distributed along with dbt. You can check out the [source code for these materializations](https://github.com/dbt-labs/dbt-core/tree/main/core/dbt/include/global_project/macros/materializations). If you need to create your own materializations, reading these files is a good place to start. Continue reading below for a deep-dive into dbt materializations. :::caution @@ -76,7 +79,7 @@ Materializations are responsible for creating new tables or view In this example, the `get_relation` method is used to fetch the state of the currently-executing model from the database. If the model exists as a view, then the view is dropped to make room for the table that will be built later in the materialization. -This is a simplified example, and the setup phase for a materialization can become quite complicated indeed! When building a materialization, be sure to consider the state of the database and any supplied [flags](flags) (ie. `--full-refresh`) to ensure that the materialization code behaves correctly in different scenarios. +This is a simplified example, and the setup phase for a materialization can become quite complicated indeed! When building a materialization, be sure to consider the state of the database and any supplied [flags](/reference/dbt-jinja-functions/flags) (ie. `--full-refresh`) to ensure that the materialization code behaves correctly in different scenarios. 
### Run pre-hooks @@ -114,7 +117,7 @@ The ability to synchronize the Relation cache is new in dbt v0.15.0 ::: -Materializations should [return](return) the list of Relations that they have created at the end of execution. dbt will use this list of Relations to update the relation cache in order to reduce the number of queries executed against the database's `information_schema`. If a list of Relations is not returned, then dbt will raise a Deprecation Warning and infer the created relation from the model's configured database, schema, and alias. +Materializations should [return](/reference/dbt-jinja-functions/return) the list of Relations that they have created at the end of execution. dbt will use this list of Relations to update the relation cache in order to reduce the number of queries executed against the database's `information_schema`. If a list of Relations is not returned, then dbt will raise a Deprecation Warning and infer the created relation from the model's configured database, schema, and alias. @@ -152,7 +155,7 @@ To explicitly remove a relation from the cache, use [adapter.drop_relation](/ref ## Materialization Configuration -Materializations support custom configuration. You might be familiar with some of these configs from materializations like `unique_key` in [incremental models](/docs/build/incremental-models) or `strategy` in [snapshots](snapshots) . +Materializations support custom configuration. You might be familiar with some of these configs from materializations like `unique_key` in [incremental models](/docs/build/incremental-models) or `strategy` in [snapshots](/docs/build/snapshots) . ### Specifying configuration options @@ -162,7 +165,7 @@ Materialization configurations can either be "optional" or "required". 
If a user # optional config.get('optional_config_name', default="the default") # required -config.require('required_conifg_name') +config.require('required_config_name') ``` For more information on the `config` dbt Jinja function, see the [config](/reference/dbt-jinja-functions/config) reference. diff --git a/website/docs/docs/get-started/learning-more/using-jinja.md b/website/docs/guides/advanced/using-jinja.md similarity index 90% rename from website/docs/docs/get-started/learning-more/using-jinja.md rename to website/docs/guides/advanced/using-jinja.md index 0476b6aebc3..40cfd2af298 100644 --- a/website/docs/docs/get-started/learning-more/using-jinja.md +++ b/website/docs/guides/advanced/using-jinja.md @@ -34,7 +34,7 @@ group by 1 The SQL for each payment method amount is repetitive, which can be difficult to maintain for a number of reasons: * If the logic or field name were to change, the code would need to be updated in three places. * Often this code is created by copying and pasting, which may lead to mistakes. -* Other analysts that review the code are less likely to notice errors as its common to only scan through repeated code. +* Other analysts that review the code are less likely to notice errors as it's common to only scan through repeated code. So we're going to use Jinja to help us clean it up, or to make our code more "DRY" ("Don't Repeat Yourself"). @@ -152,7 +152,7 @@ Getting whitespace control right is often a lot of trial and error! We recommend ## Use a macro to return payment methods Here, we've hardcoded the list of payment methods in our model. We may need to access this list from another model. A good solution here is to use a [variable](/docs/build/project-variables), but for the purpose of this tutorial, we're going to instead use a macro! 
-[Macros](jinja-macros#macros) in Jinja are pieces of code that can be called multiple times – they are analogous to a function in Python, and are extremely useful if you find yourself repeating code across multiple models. +[Macros](/docs/build/jinja-macros#macros) in Jinja are pieces of code that can be called multiple times – they are analogous to a function in Python, and are extremely useful if you find yourself repeating code across multiple models. Our macro is simply going to return the list of payment methods: @@ -168,7 +168,7 @@ Our macro is simply going to return the list of payment methods: There's a few things worth noting here: * Normally, macros take arguments -- we'll see this later on, but for now, we still need to setup our macro with empty parentheses where the arguments would normally go (i.e. `get_payment_methods()`) -* We've used the [return](return) function to return a list – without this function, the macro would return a string. +* We've used the [return](/reference/dbt-jinja-functions/return) function to return a list – without this function, the macro would return a string. Now that we have a macro for our payment methods, we can update our model as follows: @@ -202,9 +202,9 @@ payment_method from {{ ref('raw_payments') }} order by 1 ``` -[Statements](statement-blocks) provide a way to run this query and return the results to your Jinja context. This means that the list of `payment_methods` can be set based on the data in your database rather than a hardcoded value. +[Statements](/reference/dbt-jinja-functions/statement-blocks) provide a way to run this query and return the results to your Jinja context. This means that the list of `payment_methods` can be set based on the data in your database rather than a hardcoded value. -The easiest way to use a statement is through the [run_query](run_query) macro. 
For the first version, let's check what we get back from the database, by logging the results to the command line using the [log](log) function. +The easiest way to use a statement is through the [run_query](/reference/dbt-jinja-functions/run_query) macro. For the first version, let's check what we get back from the database, by logging the results to the command line using the [log](/reference/dbt-jinja-functions/log) function. @@ -243,7 +243,7 @@ This is actually an [Agate table](https://agate.readthedocs.io/page/api/table.ht {% set payment_methods_query %} select distinct payment_method -from app_data.payments +from {{ ref('raw_payments') }} order by 1 {% endset %} @@ -262,7 +262,7 @@ order by 1 ``` There's a few tricky pieces in here: -* We used the [execute](execute) variable to ensure that the code runs during the `parse` stage of dbt (otherwise an error would be thrown). +* We used the [execute](/reference/dbt-jinja-functions/execute) variable to ensure that the code runs during the `parse` stage of dbt (otherwise an error would be thrown). * We used Agate methods to get the column back as a list Fortunately, our model code doesn't need to be updated, since we're already calling the macro to get the list of payment methods. And now, any new `payment_methods` added to the underlying data model will automatically be handled by the dbt model. diff --git a/website/docs/guides/legacy/custom-generic-tests.md b/website/docs/guides/best-practices/custom-generic-tests.md similarity index 85% rename from website/docs/guides/legacy/custom-generic-tests.md rename to website/docs/guides/best-practices/custom-generic-tests.md index 601e80a1254..f2d84e38853 100644 --- a/website/docs/guides/legacy/custom-generic-tests.md +++ b/website/docs/guides/best-practices/custom-generic-tests.md @@ -1,16 +1,12 @@ --- title: "Writing custom generic tests" id: "writing-custom-generic-tests" +description: Learn how to define your own custom generic tests. 
+displayText: Writing custom generic tests +hoverSnippet: Learn how to define your own custom generic tests. --- - - -* `v0.20.0`: Generic tests (f.k.a. schema tests) are defined using `test` blocks instead of macros prefixed `test_`. They return a number of failing rows, rather than a single numeric value. -* `v1.0.0`: Generic tests can be defined in the `tests/generic` subfolder, in addition to the `macros/` directory - - - -dbt ships with [Not Null](resource-properties/tests#not-null), [Unique](resource-properties/tests#unique), [Relationships](resource-properties/tests#relationships), and [Accepted Values](resource-properties/tests#accepted-values) generic tests. (These used to be called "schema tests," and you'll still see that name in some places.) Under the hood, these generic tests are defined as `test` blocks (like macros) in a globally accessible dbt project. You can find the source code for these tests in the [global project](https://github.com/dbt-labs/dbt-core/tree/HEAD/core/dbt/include/global_project/). +dbt ships with [Not Null](/reference/resource-properties/tests#not-null), [Unique](/reference/resource-properties/tests#unique), [Relationships](/reference/resource-properties/tests#relationships), and [Accepted Values](/reference/resource-properties/tests#accepted-values) generic tests. (These used to be called "schema tests," and you'll still see that name in some places.) Under the hood, these generic tests are defined as `test` blocks (like macros) in a globally accessible dbt project. You can find the source code for these tests in the [global project](https://github.com/dbt-labs/dbt-core/tree/main/core/dbt/include/global_project/macros/generic_test_sql). :::info There are tons of generic tests defined in open source packages, such as [dbt-utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) and [dbt-expectations](https://hub.getdbt.com/calogica/dbt_expectations/latest/) — the test you're looking for might already be here! 
@@ -19,7 +15,7 @@ There are tons of generic tests defined in open source packages, such as [dbt-ut ### Generic tests with standard arguments Generic tests are defined in SQL files. Those files can live in two places: -- `tests/generic/`: that is, a special subfolder named `generic` within your [test paths](test-paths) (`tests/` by default) +- `tests/generic/`: that is, a special subfolder named `generic` within your [test paths](/reference/project-configs/test-paths) (`tests/` by default) - `macros/`: Why? Generic tests work a lot like macros, and historically, this was the only place they could be defined. If your generic test depends on complex macro logic, you may find it more convenient to define the macros and the generic test in the same file. To define your own generic tests, simply create a `test` block called ``. All generic tests should accept one or both of the standard arguments: diff --git a/website/docs/guides/legacy/debugging-errors.md b/website/docs/guides/best-practices/debugging-errors.md similarity index 92% rename from website/docs/guides/legacy/debugging-errors.md rename to website/docs/guides/best-practices/debugging-errors.md index 254f06b69e9..39670820ddd 100644 --- a/website/docs/guides/legacy/debugging-errors.md +++ b/website/docs/guides/best-practices/debugging-errors.md @@ -1,6 +1,12 @@ --- -title: Debugging errors +title: "Debugging errors" +id: "debugging-errors" +description: Learn about errors and the art of debugging them. +displayText: Debugging errors +hoverSnippet: Learn about errors and the art of debugging those errors. --- + + ## General process of debugging Learning how to debug is a skill, and one that will make you great at your role! 1. Read the error message — when writing the code behind dbt, we try our best to make error messages as useful as we can. The error message dbt produces will normally contain the type of error (more on these error types below), and the file where the error occurred. 
@@ -12,7 +18,7 @@ Learning how to debug is a skill, and one that will make you great at your role! - The `logs/dbt.log` file contains all the queries that dbt runs, and additional logging. Recent errors will be at the bottom of the file. - **dbt Cloud users**: Use the above, or the `Details` tab in the command output. - **dbt CLI users**: Note that your code editor _may_ be hiding these files from the tree [VSCode help](https://stackoverflow.com/questions/42891463/how-can-i-show-ignored-files-in-visual-studio-code)). -5. If you are really stuck, try [asking for help](getting-help). Before doing so, take the time to write your question well so that others can diagnose the problem quickly. +5. If you are really stuck, try [asking for help](/community/resources/getting-help). Before doing so, take the time to write your question well so that others can diagnose the problem quickly. ## Types of errors @@ -96,7 +102,7 @@ To view your profiles.yml file, run: open /Users/alice/.dbt ``` - - Then execute `open /Users/alice/.dbt` (adjusting accordingly), and check that you have a `profiles.yml` file. If you do not have one, set one up using [these docs](reference/profiles.yml.md) + - Then execute `open /Users/alice/.dbt` (adjusting accordingly), and check that you have a `profiles.yml` file. If you do not have one, set one up using [these docs](/docs/core/connect-data-platform/profiles.yml)
          @@ -169,7 +175,7 @@ hello: world # this is not allowed
          -- Use the reference section for [`dbt_project.yml` files](reference/dbt_project.yml.md) to correct this issue. +- Use the reference section for [`dbt_project.yml` files](/reference/dbt_project.yml.md) to correct this issue. - If you're using a key that is valid according to the documentation, check that you're using the latest version of dbt with `dbt --version`. @@ -297,7 +303,7 @@ Compilation Error - Open the file (e.g. `models/schema.yml`) as per the error message - Search for the offending key (e.g. `hello`, as per "**'hello'** was unexpected") -- Fix it. Use the [model properties](model-properties) docs to find valid keys +- Fix it. Use the [model properties](/reference/model-properties) docs to find valid keys - If you are using a valid key, check that you're using the latest version of dbt with `dbt --version` @@ -337,7 +343,7 @@ Database Error in model customers (models/customers.sql) - **dbt Cloud:** Open the model (in this case `models/customers.sql` as per the error message) - **dbt CLI:** Open the model as above. Also open the compiled SQL (in this case `target/run/jaffle_shop/models/customers.sql` as per the error message) — it can be useful to show these side-by-side in your code editor. 2. Try to re-execute the SQL to isolate the error: - - **dbt Cloud:** Use the `Run SQL` button from the model file + - **dbt Cloud:** Use the `Preview` button from the model file - **dbt CLI:** Copy and paste the compiled query into a query runner (e.g. the Snowflake UI, or a desktop app like DataGrip / TablePlus) and execute it 3. Fix the mistake. 4. Rerun the failed model. @@ -359,14 +365,14 @@ If you're hitting a strange `Database Error`, it can be a good idea to clean out ## Common pitfalls -### `run sql` vs. `dbt run` +### `Preview` vs. `dbt run` _(dbt Cloud IDE users only)_ There's two interfaces that look similar: -- The `run sql` button executes whatever SQL statement is in the active tab. 
It is the equivalent of grabbing the compiled `select` statement from the `target/compiled` directory and running it in a query editor to see the results. +- The `Preview` button executes whatever SQL statement is in the active tab. It is the equivalent of grabbing the compiled `select` statement from the `target/compiled` directory and running it in a query editor to see the results. - The `dbt run` command builds relations in your database -Using the `run sql` is useful when developing models and you want to visually inspect the results of a query. However, you'll need to make sure you have executed `dbt run` for any upstream models — otherwise dbt will try to select `from` tables and views that haven't been built. +Using the `Preview` button is useful when developing models and you want to visually inspect the results of a query. However, you'll need to make sure you have executed `dbt run` for any upstream models — otherwise dbt will try to select `from` tables and views that haven't been built. ### Forgetting to save files before running diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md new file mode 100644 index 00000000000..19c6717063c --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro.md @@ -0,0 +1,38 @@ +--- +title: "Intro to MetricFlow" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +:::tip +**This is a guide for a beta product.** We anticipate this guide will evolve alongside the Semantic Layer through community collaboration. We welcome discussions, ideas, issues, and contributions to refining best practices. +::: + +Flying cars, hoverboards, and true self-service analytics: this is the future we were promised. 
The first two might still be a few years out, but real self-service analytics is here today. With dbt Cloud's Semantic Layer, you can resolve the tension between accuracy and flexibility that has hampered analytics tools for years, empowering everybody in your organization to explore a shared reality of metrics. Best of all for analytics engineers, building with these new tools will significantly [DRY](https://docs.getdbt.com/terms/dry) up and simplify your codebase. As you'll see, the deep interaction between your dbt models and the Semantic Layer make your dbt project the ideal place to craft your metrics. + +## Learning goals + +- ❓ Understand the **purpose and capabilities** of the **dbt Semantic Layer**, particularly MetricFlow as the engine that powers it. +- 🧱 Familiarity with the core components of MetricFlow — **semantic models and metrics** — and how they work together. +- 🛠️ Hands-on **experience building** semantic models and metrics in dbt Cloud. +- 🔁 Know how to **refactor** models for MetricFlow. +- 🏅 Aware of new **best practices** to take maximum advantage of the Semantic Layer. + +## Guide structure overview + +We'll work through our learning goals via an [example project](https://github.com/dbt-labs/jaffle-sl-template), we encourage you to follow along and try the code out for yourself if you'd like on the `start-here` branch, or you can just follow along with the completed state of the codebase on the `main` branch. + +1. Getting **setup** with MetricFlow in your dbt project. +2. Building your first **semantic model** and its fundamental parts: **entities, dimensions, and measures**. +3. Building your first **metric**. +4. **Refactoring** a mart into the Semantic Layer. +5. Defining **advanced metrics**: `ratio` and `derived` types. +6. Review **best practices**. + +If you're ready to ship your users more power with less code, let's dive in! 
+ +:::info +MetricFlow is a new way to define metrics in dbt and one of the key components of the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl). It handles SQL query construction and defines the specification for dbt semantic models and metrics. + +To fully experience the dbt Semantic Layer, including the ability to query dbt metrics via external integrations, you'll need a [dbt Cloud Team or Enterprise account](https://www.getdbt.com/pricing/). +::: diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md new file mode 100644 index 00000000000..801227924dd --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-2-setup.md @@ -0,0 +1,43 @@ +--- +title: "Set up MetricFlow" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## Getting started + +First, if you want to follow along, we'll need to clone the [example project](https://github.com/dbt-labs/jaffle-sl-template). You will need access to a Snowflake, BigQuery, Databricks, or Postgres warehouse for this, for the time being. The project is our classic Jaffle Shop, a simulated chain restaurant serving [jaffles](https://en.wikipedia.org/wiki/Pie_iron) and tasty beverages. + +```shell +git clone git@github.com:dbt-labs/jaffle-sl-template.git +cd path/to/project +``` + +Next, before you start writing code, you need to install MetricFlow as an extension of a dbt adapter from PyPI (dbt Core users only). The MetricFlow is compatible with Python versions 3.8 through 3.11. 
+ +We'll use pip to install MetricFlow and our dbt adapter: + +```shell +# activate a virtual environment for your project, +# if you don't have a name you like to use we suggest .venv +python -m venv [virtual environment name] +source [virtual environment name]/bin/activate +# install dbt and MetricFlow +pip install "dbt-metricflow[adapter name]" +# e.g. pip install "dbt-metricflow[snowflake]" +``` + +Lastly, to get to the pre-Semantic Layer starting state, check out the `start-here` branch. + +```shell +git checkout start-here +``` + +For more information, refer to the [MetricFlow commands](/docs/build/metricflow-commands) or a [quickstart](/quickstarts) to get more familiar with setting up a dbt project. + +## Basic commands + +- 💻 This package will install both `dbt` and `mf` as CLIs in our virtual environment. All the regular `dbt` commands like `run`, `build`, and `test` are available. +- 🔍 A less common one that will come in handy with the Semantic Layer is `dbt parse`. This will parse your project and generate a **semantic manifest**, a representation of meaningful connections described by your project. This file gives MetricFlow a **state of the world from which to generate queries**. +- 🧰 In addition to `dbt`, you'll have access to `mf` commands like `query` and `validate-configs`, which operate based on that semantic manifest. We'll dig more into all of these as we go along. +- 🛠️ Let's start off by running a `dbt build` to get the **starting state** of our project built. 
diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md new file mode 100644 index 00000000000..a2dc55e37ae --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-3-build-semantic-models.md @@ -0,0 +1,296 @@ +--- +title: "Building semantic models" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## How to build a semantic model + +A semantic model is the MetricFlow equivalent to a logical layer model (what historically has just been called a 'model' in dbt land). Just as configurations for models are defined on the `models:` YAML key, configurations for semantic models are housed under `semantic_models:`. A key difference is that while a logical model consists of configuration and SQL or Python code, a **semantic model is defined purely via YAML**. Rather than encoding a specific dataset, a **semantic model describes relationships** that let your end users select and refine their own datasets reliably. + +- ⚙️ Semantic models are **comprised of three components**: + - 🫂 **entities**: these describe the **relationships** between various semantic models (think ids) + - 🪣 **dimensions**: these are the columns you want to **slice, dice, group, and filter by** (think timestamps, categories, booleans). + - 📏 **measures**: these are the **quantitative values you want to aggregate** +- 📚 We define **columns as being an entity, dimension, or measure**. + +:::tip +**File per model**. Given the interdependence of logical and semantic models, and semantic models and metrics, we've updated our best practice recommendation to a one YAML file per model approach if you're using the Semantic Layer. 
This houses everything related to a model in one place and preserves unique file names for quickly getting to the code you want. +::: + +## Defining orders + +- 🥪 The semantic model we're going to define is _orders_. +- 📗 We define it as a **YAML dictionary in the semantic models list**. +- 📑 It will have a **name, entities list, dimensions list, and measures list**. +- ⏬ We recommend defining them **in this order consistently** as a style best practice. + +```YAML +semantic_models: + - name: orders + entities: + ... + dimensions: + ... + measures: + ... +``` + +- Next we'll point to the corresponding logical model by supplying a [`ref`](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) in the `model:` property, and a `description` for documentation. + +```YAML +semantic_models: + - name: orders + description: | + Model containing order data. The grain of the table is the order id. + model: ref('stg_orders') + entities: + ... + dimensions: + ... + measures: + ... +``` + +## Establishing our entities + +- 🫂 Entities are the **objects and concepts** in our data that _have_ dimensions and measures. You can think of them as the **nouns** of our project, the **spines** of our queries that we may want to aggregate by, or simply the **join keys**. +- 🔀 Entities help MetricFlow understand **how various semantic models relate to one another**. +- ⛓️ Unlike many other semantic layers, in MetricFlow **we do not need to describe joins explicitly**, instead the **relationships are implicitly described by entities**. +- 1️⃣ Each semantic model should have **one primary entity** defined for itself, and **any number of foreign entities** for other semantic models it may join to. +- 🫂 Entities require a **name and type** + - 🔑 Types available are **primary**, **foreign**, **unique** or **natural** — we'll be focused on the first two for now, but you can [read more about unique and natural keys](https://docs.getdbt.com/docs/build/entities#entity-types). 
+ +### Entities in action + +If we look at the staging model for orders, we see that it has 3 id columns, so we'll need three entities. + +```SQL +renamed as ( + + select + + ---------- ids + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- properties + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps + ordered_at + + from source +``` + +- 👉 We add them with a **`name`, `type`, and optional `expr`** (expression). The expression can be any valid SQL expression on your platform. +- 📛 If you **don't add an expression**, MetricFlow will **assume the name is equal to the column name** in the underlying logical model. +- 👍 Our best practices pattern is to, whenever possible, provide a `name` that is the singular form of the subject or grain of the table, and use `expr` to specify the precise column name (with `_id` etc). This will let us write **more readable metrics** on top of these semantic models. + +```YAML +semantic_models: + - name: orders + ... + entities: + # we use the column for the name here because order is a reserved word in SQL + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + + dimensions: + ... + measures: + ... + +``` + +## Defining our dimensions + +- 🧮 Dimensions are the columns that we want to **filter and group by**, **the adjectives of our project**. They come in three types: + - **categorical** + - **time** + - slowly changing dimensions — [these are covered in the documentation](https://docs.getdbt.com/docs/build/dimensions#scd-type-ii), and a little more complex. To focus on building your mental models of MetricFlow's fundamentals, we won't be using SCDs in this guide. +- ➕ We're **not limited to existing columns**, we can use the `expr` property to add simple computations in our dimensions. 
+- 📛 Categorical dimensions are the simplest, they simply require a `name` and `type` (type being categorical). **If the `name` property matches the name of the dimension column**, that's it, you're done. If you want or need to use a `name` other than the column name, or do some filtering or computation, **you can supply an optional `expr` property** to evaluate for the dimension. + +### Dimensions in action + +- 👀 Let's look at our staging model again and see what fields we have available. + +```SQL +select + + ---------- ids -> entities + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- numerics -> measures + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps -> dimensions + ordered_at + +from source +``` + +- ⏰ For now the only dimension to add is a **time dimension**. +- 🕰️ At least one **primary time dimension** is **required** for any semantic models that **have measures**. +- 1️⃣ We denote this with the `is_primary` property, or if there is only a one-time dimension supplied it is primary by default. Below we only have `ordered_at` as a timestamp so we don't need to specify anything except the maximum granularity we're bucketing to (in this case, day). + +```YAML +dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using [BigQuery](/docs/build/dimensions#time) + type: time + type_params: + time_granularity: day +``` + +:::tip +**Dimensional models**. You may have some models that do not contain measures, just dimensional data that enriches other facts. That's totally fine, a semantic model does not require dimensions or measures, it just needs a primary entity, and if you do have measures, a primary time dimension. + +We'll discuss an alternate situation, dimensional tables that have static numeric values like supply costs or tax rates but no time dimensions, later in the Guide. 
+::: + +- 🔢 We can also **make a dimension out of a numeric column** that would typically be a measure. +- 🪣 Using `expr` we can **create buckets of values that we label** for our dimension. We'll add one of these in for labeling 'large orders' as any order totals over $50. + +```YAML +... +dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using BigQuery + type: time + type_params: + time_granularity: day + - name: is_large_order + type: categorical + expr: case when order_total > 50 then true else false end +... +``` + +## Making our measures + +- 📏 Measures are the final component of a semantic model. They describe the **numeric values that we want to aggregate**. +- 🧱 Measures form **the building blocks of metrics**, with entities and dimensions helping us combine, group, and filter those metrics correctly. +- 🏃 You can think of them as something like the **verbs of a semantic model**. + +### Measures in action + +- 👀 Let's look at **our staging model** one last time and see what **fields we want to measure**. + +```SQL +select + + ---------- ids -> entities + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- numerics -> measures + (order_total / 100.0) as order_total, + (tax_paid / 100.0) as tax_paid, + + ---------- timestamps -> dimensions + ordered_at + +from source +``` + +- ➕ Here `order_total` and `tax_paid` are the **columns we want as measures**. +- 📝 We can describe them via the code below, specifying a **name, description, aggregation, and expression**. +- 👍 As before, MetricFlow will default to the **name being the name of a column when no expression is supplied**. +- 🧮 [Many different aggregations](https://docs.getdbt.com/docs/build/measures#aggregation) are available to us. Here we just want sums. + +```YAML +measures: + - name: order_total + description: The total amount for each order including taxes. 
+ agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum +``` + +- 🆕 We can also **create new measures using expressions**, for instance adding a count of individual orders as below. + +```YAML + - name: order_count + description: The count of individual orders. + expr: 1 + agg: sum +``` + +## Validating configs + +Our completed code should look like this, our first semantic model! + +```YAML +semantic_models: + - name: orders + defaults: + agg_time_dimension: ordered_at + description: | + Order fact table. This table is at the order grain with one row per order. + + model: ref('stg_orders') + + entities: + - name: order_id + type: primary + - name: location + type: foreign + expr: location_id + - name: customer + type: foreign + expr: customer_id + + dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + # use date_trunc(ordered_at, DAY) if using BigQuery + type: time + type_params: + time_granularity: day + - name: is_large_order + type: categorical + expr: case when order_total > 50 then true else false end + + measures: + - name: order_total + description: The total revenue for each order. + agg: sum + - name: order_count + description: The count of individual orders. + expr: 1 + agg: sum + - name: tax_paid + description: The total tax paid on each order. + agg: sum +``` + +- 🦺 We can check that it's a valid configuration and works with the real data our dbt project is generating by using the `mf validate-configs` command. This will: + 1. **Parse the semantic manifest** our configuration describes out of the dbt project. + 2. Validate the **internal semantics** of the manifest as described by our code. + 3. Validate the **external semantics** of the manifest against your data warehouse (e.g. making sure that a column specified as a dimension exists on the proper table) + +## Review and next steps + +Let's review the basics of semantic models: + +- 🧱 Consist of **entities, dimensions, and measures**. 
+- 🫂 Describe the **semantics and relationships of objects** in the warehouse. +- 1️⃣ Correspond to a **single logical model** in your dbt project. + +Next up, let's use our new semantic model to **build a metric**! diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md new file mode 100644 index 00000000000..cd0efdc9e64 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-4-build-metrics.md @@ -0,0 +1,41 @@ +--- +title: "Building metrics" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## How to build metrics + +- 💹 We'll start with one of the most important metrics for any business: **revenue**. +- 📖 For now, our metric for revenue will be **defined as the sum of order totals excluding tax**. +- 🆕 Let's create a file called `metrics.yml` in our marts folder for now to write our first metric in. + +## Defining revenue + +- 🔢 Metrics have four basic properties: + - `name:` We'll use 'revenue' to reference this metric. + - `description:` For documentation. + - `label:` The display name for the metric in downstream tools. + - `type:` one of `simple`, `ratio`, or `derived`. +- 🎛️ Each type has different `type_params`. +- 🛠️ We'll build a **simple metric** first to get the hang of it, and move on to ratio and derived metrics later. +- 📏 Simple metrics are built on a **single measure defined as a type parameter**. +- 🔜 Defining **measures as their own distinct component** on semantic models is critical to allowing the **flexibility of more advanced metrics**, though simple metrics act mainly as **pass-through that provide filtering** and labeling options. 
A `create_metric` option for measures is coming in the next version of MetricFlow to **save you writing extra code** for simple metrics that make no changes to the underlying measure. + +```YAML +metrics: + - name: revenue + description: Sum of the order total. + label: Revenue + type: simple + type_params: + measure: order_total +``` + +## Query your metric + +- It's best practice any time we're updating our semantic layer code to run a `dbt parse && mf validate-configs`. +- If everything passes, we can start querying this metric with `mf query`! +- `mf query` is not how you would use the tool in production, that's handled by the dbt Cloud Semantic Layer's features. It's available for testing results of various metric queries in development, exactly as we're using it now. +- Try `mf query --metrics revenue --group-by metric_time__day` and see a preview of the data come back. +- Note the structure of the above query. We select the metric(s) we want and the dimensions to group them by — we use dunders (double underscores e.g. `metric_time__[time bucket]`) to designate time dimensions or other non-unique dimensions that need a specified entity path to resolve (e.g. if you have an orders location dimension and an employee location dimension both named 'location' you would need dunders to specify `orders__location` or `employee__location`). 
diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md new file mode 100644 index 00000000000..b2efb39e9fc --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-5-refactor-a-mart.md @@ -0,0 +1,242 @@ +--- +title: "Refactor an existing mart" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## A new approach + +We've covered the basics, now it's time to dig in to the fun and messy part: how do we refactor an existing mart in dbt into semantic models and metrics? + +Let's look at the differences we can observe in how we might approach this with MetricFlow supercharging dbt versus how we work without a Semantic Layer. These differences can then inform our structure. + +- 🍊 In dbt, we tend to create **highly denormalized datasets** that bring **everything you want around a certain entity or process into a single table**. +- 💜 The problem is, this **limits the dimensionality available to MetricFlow**. The more we pre-compute and 'freeze' into place, the less flexible our data is. +- 🚰 In MetricFlow, we ideally want **highly normalized**, star schema-like data that then allows MetricFlow to shine as a **denormalization engine**. +- ∞ Another way to think about this is that instead of moving down a list of requested priorities trying to pre-make as many combinations of our marts as possible — increasing lines of code and complexity — we can **let MetricFlow present every combination possible without specifically coding it**. +- 🏗️ To resolve these approaches optimally, we'll need to shift some **fundamental aspects of our modeling strategy**. + +## Refactor steps outlined + +We recommend an incremental implementation process that looks something like this: + +1. 
👉 Identify **an important output** (a revenue chart on a dashboard, for example) and the mart model(s) that supply this output. +2. 🔍 Examine all the **entities that are components** of this mart (for instance, an orders mart may include customers, shipping, and product data). +3. 🛠️ **Build semantic models and metrics** for all the required components. +4. 👯 Create a **clone of the output** on top of the Semantic Layer. +5. 💻 Audit to **ensure you get accurate outputs**. +6. 💎 Use `mf list dimensions --metrics [metric_name]` to check that your refactoring is increasing dimensionality (flexibility). +7. 👉 Identify **any other outputs** that point to the mart and **move them to the Semantic Layer**. +8. ✌️ Put a **deprecation plan** in place for the mart. + +You would then **continue this process** on other outputs and marts moving down a list of **priorities**. Each model as you go along will be faster and easier as you'll **reuse many of the same components** that will already have been semantically modeled. + +## Let's make a `revenue` metric + +So far we've been pointing our new semantic models at a staging model to simplify things as we build new mental models for MetricFlow. In reality, unless you're implementing MetricFlow in a green-field dbt project, you probably are going to have some refactoring to do. So let's get into that in detail. + +1. 📚 Per the above steps, we've identified our target, now we need to identify all the components we need, these will be all the 'import' CTEs at the top of our mart. Let's look at `orders` and `order_items`, the likely models to generate revenue, we see we'll need: `orders`, `order_items`, `products`, `locations`, and `supplies`. +2. 🗺️ We'll next make semantic models for all of these. Let's walk through a straightforward conversion first with `locations`. +3. ⛓️ We'll want to first decide if we need to do any joining to get this into the shape we want for our semantic model. 
The biggest determinants of this are two factors: + - 📏 Does this semantic model **contain measures**? + - 🕥 Does this semantic model have a **primary timestamp**? + - 🫂 If a semantic model **has measures but no timestamp** (for example, supplies in the example project, which has static costs of supplies), you'll likely want to **sacrifice some normalization and join it on to another model** that has a primary timestamp to allow for metric aggregation. +4. 🔄 If we _don't_ need any joins, we'll just go straight to the staging model for our semantic model's `ref`. Locations does have a `tax_rate` measure, but it also has an `opened_at` timestamp, so we can go **straight to the staging model** here. +5. 🥇 We specify our **primary entity** (based on `location_id`), dimensions (one categorical, `location_name`, and one **primary time dimension** `opened_at`), and lastly our measures, in this case just `average_tax_rate`. + + ```YAML + semantic_models: + - name: locations + description: | + Location dimension table. The grain of the table is one row per location. + model: ref('stg_locations') + entities: + - name: location + type: primary + expr: location_id + dimensions: + - name: location_name + type: categorical + - name: date_trunc('day', opened_at) + type: time + type_params: + time_granularity: day + measures: + - name: average_tax_rate + description: Average tax rate. + expr: tax_rate + agg: avg + ``` + +## Semantic and logical interaction + +Now, let's tackle a thornier situation. Products and supplies both have dimensions and measures but no time dimension. Products has a one-to-one relationship with `order_items`, enriching that table, which is itself just a mapping table of products to orders. Additionally, products have a one-to-many relationship with supplies. The high-level ERD looks like the diagram below. 
+ + + +So to calculate, for instance, the cost of ingredients and supplies for a given order, we'll need to do some joining and aggregating, but again we **lack a time dimension for products and supplies**. This is the signal to us that we'll **need to build a logical mart** and point our semantic model at that. + +:::tip +**dbt 🧡 MetricFlow.** This is where integrating your semantic definitions into your dbt project really starts to pay dividends. The interaction between the logical and semantic layers is so dynamic, you either need to house them in one codebase or facilitate a lot of cross-project communication and dependency. +::: + +1. 🎯 Let's aim at, to start, building a table at the `order_items` grain. We can aggregate supply costs up, map over the fields we want from products, such as price, and bring the `ordered_at` timestamp we need over from the orders table. We'll write the following code in `models/marts/order_items.sql`. + + ```SQL + {{ + config( + materialized = 'table', + ) + }} + + with + + order_items as ( + + select * from {{ ref('stg_order_items') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders')}} + + ), + + products as ( + + select * from {{ ref('stg_products') }} + + ), + + supplies as ( + + select * from {{ ref('stg_supplies') }} + + ), + + order_supplies_summary as ( + + select + product_id, + sum(supply_cost) as supply_cost + + from supplies + + group by 1 + ), + + joined as ( + + select + order_items.*, + products.product_price, + order_supplies_summary.supply_cost, + products.is_food_item, + products.is_drink_item, + orders.ordered_at + + from order_items + + left join orders on order_items.order_id = orders.order_id + + left join products on order_items.product_id = products.product_id + + left join order_supplies_summary on order_items.product_id = order_supplies_summary.product_id + + ) + + select * from joined + ``` + +2. 🏗️ Now we've got a table that looks more like what we want to feed into MetricFlow. 
Next, we'll **build a semantic model on top of this new mart** in `models/marts/order_items.yml`. Again, we'll identify our **entities, then dimensions, then measures**. + + ```YAML + semantic_models: + #The name of the semantic model. + - name: order_items + defaults: + agg_time_dimension: ordered_at + description: | + Items contained in each order. The grain of the table is one row per order item. + model: ref('order_items') + entities: + - name: order_item + type: primary + expr: order_item_id + - name: order_id + type: foreign + expr: order_id + - name: product + type: foreign + expr: product_id + dimensions: + - name: ordered_at + expr: date_trunc('day', ordered_at) + type: time + type_params: + time_granularity: day + - name: is_food_item + type: categorical + - name: is_drink_item + type: categorical + measures: + - name: revenue + description: The revenue generated for each order item. Revenue is calculated as a sum of revenue associated with each product in an order. + agg: sum + expr: product_price + - name: food_revenue + description: The revenue generated for each order item. Revenue is calculated as a sum of revenue associated with each product in an order. + agg: sum + expr: case when is_food_item = 1 then product_price else 0 end + - name: drink_revenue + description: The revenue generated for each order item. Revenue is calculated as a sum of revenue associated with each product in an order. + agg: sum + expr: case when is_drink_item = 1 then product_price else 0 end + - name: median_revenue + description: The median revenue generated for each order item. + agg: median + expr: product_price + ``` + +3. 📏 Finally, let's **build a simple revenue metric** on top of our semantic model now. + + ```YAML + metrics: + - name: revenue + description: Sum of the product revenue for each order item. Excludes tax. 
+ type: simple + label: Revenue + type_params: + measure: revenue + ``` + +## Checking our work + +- 🔍 We always will start our **auditing** with a `dbt parse && mf validate-configs` to **ensure our code works** before we examine its output. +- 👯 If we're working there, we'll move to trying out an `mf query` that **replicates the logic of the output** we're trying to refactor. +- 💸 For our example we want to **audit monthly revenue**, to do that we'd run the query below. You can [read more about the MetricFlow CLI](https://docs.getdbt.com/docs/build/metricflow-cli). + +### Example query + +```shell +mf query --metrics revenue --group-by metric_time__month +``` + +### Example query results + +```shell +✔ Success 🦄 - query completed after 1.02 seconds +| METRIC_TIME__MONTH | REVENUE | +|:---------------------|----------:| +| 2016-09-01 00:00:00 | 17032.00 | +| 2016-10-01 00:00:00 | 20684.00 | +| 2016-11-01 00:00:00 | 26338.00 | +| 2016-12-01 00:00:00 | 10685.00 | +``` + +- Try introducing some other dimensions from the semantic models into the `group-by` arguments to get a feel for this command. + +## An alternate approach + +If you **don't have capacity to refactor** some of your marts, they can **still benefit from the Semantic Layer**. The above process is about **maximizing dimensionality** for the long term. In the short term, making your **marts as-is available to MetricFlow** unlocks greatly increased functionality. For an example of this quicker approach check out the `customers` SQL and YAML files on the `main` branch. This displays a **typical denormalized dbt mart** being hooked into MetricFlow. 
diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md new file mode 100644 index 00000000000..fe7438b5800 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-6-advanced-metrics.md @@ -0,0 +1,79 @@ +--- +title: "More advanced metrics" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## More advanced metric types + +We're not limited to just passing measures through to our metrics, we can also _combine_ measures to model more advanced metrics. + +- 🍊 **Ratio** metrics are, as the name implies, about **comparing two metrics as a numerator and a denominator** to form a new metric, for instance the percentage of order items that are food items instead of drinks. +- 🧱 **Derived** metrics are when we want to **write an expression** that calculates a metric **using multiple metrics**. A classic example here is our gross profit calculated by subtracting costs from revenue. +- ➕ **Cumulative** metrics calculate all of a **measure over a given window**, such as the past week, or if no window is supplied, the all-time total of that measure. + +## Ratio metrics + +- 🔢 We need to establish one measure that will be our **numerator**, and one that will be our **denominator**. +- 🥪 Let's calculate the **percentage** of our Jaffle Shop revenue that **comes from food items**. +- 💰 We already have our denominator, revenue, but we'll want to **make a new metric for our numerator** called `food_revenue`. + +```YAML + - name: food_revenue + description: The revenue from food in each order. + label: Food Revenue + type: simple + type_params: + measure: revenue + filter: | + {{ Dimension('order__is_food_order') }} = true +``` + +- 📝 Now we can set up our ratio metric. 
+ +```YAML +- name: food_revenue_pct + description: The % of order revenue from food. + label: Food Revenue % + type: ratio + type_params: + numerator: food_revenue + denominator: revenue +``` + +## Derived metrics + +- 🆙 Now let's really have some fun. One of the most important metrics for any business is not just revenue, but _revenue growth_. Let's use a derived metric to build month-over-month revenue growth. +- ⚙️ A derived metric has a couple key components: + - 📚 A list of metrics to build on. These can be manipulated and filtered in various ways; here we'll use the `offset_window` property to lag by a month. + - 🧮 An expression that performs a calculation with these metrics. +- With these parts we can assemble complex logic that would otherwise need to be 'frozen' in logical models. + +```YAML +- name: revenue_growth_mom + description: "Percentage growth of revenue compared to 1 month ago. Excludes tax." + type: derived + label: Revenue Growth % M/M + type_params: + expr: (current_revenue - revenue_prev_month) * 100 / revenue_prev_month + metrics: + - name: revenue + alias: current_revenue + - name: revenue + offset_window: 1 month + alias: revenue_prev_month +``` + +## Cumulative metrics + +- ➕ Lastly, let's build a **cumulative metric**. In keeping with our theme of business priorities, let's continue with revenue and build an **all-time revenue metric** for any given time window. +- 🪟 All we need to do is indicate the type is `cumulative` and not supply a `window` in the `type_params`, which indicates we want cumulative for the entire time period our end users select. + +```YAML +- name: cumulative_revenue + description: The cumulative revenue for all orders. 
+ label: Cumulative Revenue (All Time) + type: cumulative + type_params: + measure: revenue +``` diff --git a/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md new file mode 100644 index 00000000000..a1062721177 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-build-our-metrics/semantic-layer-7-conclusion.md @@ -0,0 +1,34 @@ +--- +title: "Best practices" +description: Getting started with the dbt and MetricFlow +hoverSnippet: Learn how to get started with the dbt and MetricFlow +--- + +## Putting it all together + +- 📊 We've **created semantic models and metrics** for basic coverage of a key business area. +- 🔁 In doing so we've **refactored a 'static' mart** into a dynamic, flexible new life in the Semantic Layer. +- 🗺️ We encourage you to **explore the `main` branch** of the [example project repo](https://github.com/dbt-labs/jaffle-sl-template) to see even more metrics and semantic models in action within a project fully ported to the Semantic Layer. + +## Best practices + +- ✅ **Prefer normalization** when possible to allow MetricFlow to denormalize dynamically for end users. +- ✅ Use **marts to denormalize** when needed, for instance grouping tables together into richer components, or getting measures on dimensional tables attached to a table with a time spine. +- ✅ When source data is **well normalized** you can **build semantic models on top of staging models**. +- ✅ **Prefer** computing values in **measures and metrics** when possible as opposed to in fixed marts. +- ❌ **Don't directly refactor the code you have in production**, build in parallel so you can audit the Semantic Layer output and deprecate old marts gracefully. + +## Key commands + +- 🔑 Use `dbt parse && mf validate-configs` to generate a semantic manifest and ensure it works with your data. 
+- 🔑 Use `mf list dimensions --metrics [metric name]` to check that you're increasing dimensionality as you progress. +- 🔑 Use `mf query [query options]` to preview the output from your metrics as you develop. + +## Next steps + +- 🗺️ Map out a clear plan for your dbt project to **incrementally adopt the Semantic Layer**. +- 🤗 Get involved in the community and ask questions, **help craft best practices**, and share your progress in building a dbt Semantic Layer. + +The dbt Semantic Layer is the biggest paradigm shift thus far in the young practice of analytics engineering. It's ready to provide value right away, but is most impactful if you move your project towards increasing normalization, and allow MetricFlow to do the denormalization for you with maximum dimensionality. + +We will be releasing more resources soon covering implementation of the Semantic Layer in dbt Cloud with various integrated BI tools. This is just the beginning, hopefully this guide has given you a path forward for building your data platform in this new era. diff --git a/website/docs/guides/best-practices/how-we-mesh/mesh-1-intro.md b/website/docs/guides/best-practices/how-we-mesh/mesh-1-intro.md new file mode 100644 index 00000000000..ba1660a8d82 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-mesh/mesh-1-intro.md @@ -0,0 +1,39 @@ +--- +title: "Intro to dbt Mesh" +description: Getting started with dbt Mesh patterns +hoverSnippet: Learn how to get started with dbt Mesh +--- + +## What is dbt Mesh? + +Organizations of all sizes rely upon dbt to manage their data transformations, from small startups to large enterprises. At scale, it can be challenging to coordinate all the organizational and technical requirements demanded by your stakeholders within the scope of a single dbt project. To date, there also hasn't been a first-class way to effectively manage the dependencies, governance, and workflows between multiple dbt projects. 
+ +Regardless of your organization's size and complexity, dbt should empower data teams to work independently and collaboratively; sharing data, code, and best practices without sacrificing security or autonomy. dbt Mesh provides the tooling for teams to finally achieve this. + +dbt Mesh is not a single product: it is a pattern enabled by a convergence of several features in dbt: + +- **[Cross-project references](/docs/collaborate/govern/project-dependencies#how-to-use-ref)** - this is the foundational feature that enables the multi-project deployments. `{{ ref() }}`s now work across dbt Cloud projects on Enterprise plans. +- **[dbt Explorer](/docs/collaborate/explore-projects)** - dbt Cloud's metadata-powered documentation platform, complete with full, cross-project lineage. +- **Governance** - dbt's new governance features allow you to manage access to your dbt models both within and across projects. + - **[Groups](/docs/collaborate/govern/model-access#groups)** - groups allow you to assign models to subsets within a project. + - **[Access](/docs/collaborate/govern/model-access#access-modifiers)** - access configs allow you to control who can reference models. +- **[Model Versions](/docs/collaborate/govern/model-versions)** - when coordinating across projects and teams, we recommend treating your data models as stable APIs. Model versioning is the mechanism to allow graceful adoption and deprecation of models as they evolve. +- **[Model Contracts](/docs/collaborate/govern/model-contracts)** - data contracts set explicit expectations on the shape of the data to ensure data changes upstream of dbt or within a project's logic don't break downstream consumers' data products. + +## Who is dbt Mesh for? + +The multi-project architecture helps organizations with mature, complex transformation workflows in dbt increase the flexibility and performance of their dbt projects. 
If you're already using dbt and your project has started to experience any of the following, you're likely ready to start exploring this paradigm: + +- The **number of models** in your project is degrading performance and slowing down development. +- Teams have developed **separate workflows** and need to decouple development from each other. +- **Security and governance** requirements are increasing and would benefit from increased isolation. + +dbt Cloud is designed to coordinate the features above and simplify the complexity to solve for these problems. + +If you're just starting your dbt journey, don't worry about building a multi-project architecture right away. You can _incrementally_ adopt the features in this guide as you scale. The collection of features work effectively as independent tools. Familiarizing yourself with the tooling and features that make up a multi-project architecture, and how they can apply to your organization will help you make better decisions as you grow. + +## Learning goals + +- Understand the **purpose and tradeoffs** of building a multi-project architecture. +- Develop an intuition for various **dbt Mesh patterns** and how to design a multi-project architecture for your organization. +- Establish recommended steps to **incrementally adopt** these patterns in your dbt implementation. diff --git a/website/docs/guides/best-practices/how-we-mesh/mesh-2-structures.md b/website/docs/guides/best-practices/how-we-mesh/mesh-2-structures.md new file mode 100644 index 00000000000..937515954af --- /dev/null +++ b/website/docs/guides/best-practices/how-we-mesh/mesh-2-structures.md @@ -0,0 +1,52 @@ +--- +title: Deciding how to structure your dbt Mesh +description: Getting started with dbt Mesh patterns +hoverSnippet: Learn how to get started with dbt Mesh +--- +## Exploring mesh patterns + +When adopting a multi-project architecture, where do you draw the lines between projects? 
+ +How should you organize data workflows in a world where instead of having a single dbt DAG, you have multiple projects speaking to each other, each comprised of their own DAG? + +Adopting the dbt Mesh pattern is not a one-size-fits-all process. In fact, it's the opposite! It's about customizing your project structure to fit _your_ team and _your_ data. Now you can mold your organizational knowledge graph to your organizational people graph, bringing people and data closer together rather than compromising one for the other. + +While there is not a single best way to implement this pattern, there are some common decision points that will be helpful for you to consider. + +At a high level, you’ll need to decide: + +- Where to draw the lines between your dbt Projects -- i.e. how do you determine where to split your DAG and which models go in which project? +- How to manage your code -- do you want multiple dbt Projects living in the same repository (mono-repo) or do you want to have multiple repos with one repo per project? + +## Define your project interfaces by splitting your DAG + +The first (and perhaps most difficult!) decision when migrating to a multi-project architecture is deciding where to draw the line in your DAG to define the interfaces between your projects. Let's explore some language for discussing the design of these patterns. + +### Vertical splits + +Vertical splits separate out layers of transformation in DAG order. Let's look at some examples. + +- **Splitting up staging and mart layers** to create a more tightly-controlled, shared set of components that other projects build on but can't edit. +- **Isolating earlier models for security and governance requirements** to separate out and mask PII data so that downstream consumers can't access it is a common use case for a vertical split. 
+- **Protecting complex or expensive data** to isolate large or complex models that are expensive to run so that they are safe from accidental selection, independently deployable, and easier to debug when they have issues. + +### Horizontal splits + +Horizontal splits separate your DAG based on source or domain. These splits are often based around the shape and size of the data and how it's used. Let's consider some possibilities for horizontal splitting. + +- **Team consumption patterns.** For example, splitting out the marketing team's data flow into a separate project. +- **Data from different sources.** For example, clickstream event data and transactional ecommerce data may need to be modeled independently of each other. +- **Team workflows.** For example, if two embedded groups operate at different paces, you may want to split the projects up so they can move independently. + +### Combining these strategies + +- **These are not either/or techniques**. You should consider both types of splits, and combine them in any way that makes sense for your organization. +- **Pick one type of split and focus on that first**. If you have a hub-and-spoke team topology for example, handle breaking out the central platform project before you split the remainder into domains. Then if you need to break those domains up horizontally you can focus on that after the fact. +- **DRY applies to underlying data, not just code.** Regardless of your strategy, you should not be sourcing the same rows and columns into multiple nodes. When working within a mesh pattern it becomes increasingly important that we don't duplicate logic or data. + +## Determine your git strategy + +A multi-project architecture can exist in a single repo (monorepo) or as multiple projects, with each one being in their own repository (multi-repo). 
+ +- If you're a **smaller team** looking primarily to speed up and simplify development, a **monorepo** is likely the right choice, but can become unwieldy as the number of projects, models and contributors grow. +- If you’re a **larger team with multiple groups**, and need to decouple projects for security and enablement of different development styles and rhythms, a **multi-repo setup** is your best bet. diff --git a/website/docs/guides/best-practices/how-we-mesh/mesh-3-implementation.md b/website/docs/guides/best-practices/how-we-mesh/mesh-3-implementation.md new file mode 100644 index 00000000000..cfbbc7a1f28 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-mesh/mesh-3-implementation.md @@ -0,0 +1,130 @@ +--- +title: "Implementing your mesh plan" +description: Getting started with dbt Mesh patterns +hoverSnippet: Learn how to get started with dbt Mesh +--- + +As mentioned before, the key decision in migrating to a multi-project architecture is understanding how your project is already being grouped, built, and deployed. We can use this information to inform our decision to split our project apart. + +- **Examine your jobs** - which sets of models are most often built together? +- **Look at your lineage graph** - how are models connected? +- **Look at your selectors** defined in `selectors.yml` - how do people already define resource groups? +- **Talk to teams** about what sort of separation naturally exists right now. + - Are there various domains people are focused on? + - Are there various sizes, shapes, and sources of data that get handled separately (such as click event data)? + - Are there people focused on separate levels of transformation, such as landing and staging data or building marts? + +## Add groups and access + +Once you have a sense of some initial groupings, you can first implement **group and access permissions** within a single project. 
+ +- First you can create a [group](/docs/build/groups) to define the owner of a set of models. + +```yml +# in models/__groups.yml + +groups: + - name: marketing + owner: + name: Ben Jaffleck + email: ben.jaffleck@jaffleshop.com +``` + +- Then, we can add models to that group using the `group:` key in the model's YAML entry. + +```yml +# in models/marketing/__models.yml + +models: + - name: fct_marketing_model + group: marketing + - name: stg_marketing_model + group: marketing +``` + +- Once you've added models to the group, you can **add [access](/docs/collaborate/govern/model-access) settings to the models** based on their connections between groups, *opting for the most private access that will maintain current functionality*. This means that any model that has *only* relationships to other models in the same group should be `private`, and any model that has cross-group relationships, or is a terminal node in the group DAG should be `protected` so that other parts of the DAG can continue to reference it. + +```yml +# in models/marketing/__models.yml + +models: + - name: fct_marketing_model + group: marketing + access: protected + - name: stg_marketing_model + group: marketing + access: private +``` + +- **Validate these groups by incrementally migrating your jobs** to execute these groups specifically via selection syntax. We would recommend doing this in parallel to your production jobs until you’re sure about them. This will help you feel out if you’ve drawn the lines in the right place. +- If you find yourself **consistently making changes across multiple groups** when you update logic, that’s a sign that **you may want to rethink your groups**. + +## Split your projects + +1. **Move your grouped models into a subfolder**. This will include any model in the selected group, its associated YAML entry, as well as its parent or child resources as appropriate depending on where this group sits in your DAG. + 1. 
Note that just like in your dbt project, circular references are not allowed! Project B cannot have parents and children in Project A, for example. +2. **Create a new `dbt_project.yml` file** in the subdirectory. +3. **Copy any macros** used by the resources you moved. +4. **Create a new `packages.yml` file** in your subdirectory with the packages that are used by the resources you moved. +5. **Update `{{ ref }}` functions** — For any model that has a cross-project dependency (this may be in the files you moved, or in the files that remain in your project): + 1. Update the `{{ ref() }}` function to have two arguments, where the first is the name of the source project and the second is the name of the model: e.g. `{{ ref('jaffle_shop', 'my_upstream_model') }}` + 2. Update the upstream, cross-project parents’ `access` configs to `public`, ensuring any project can safely `{{ ref() }}` those models. + 3. We *highly* recommend adding a [model contract](/docs/collaborate/govern/model-contracts) to the upstream models to ensure the data shape is consistent and reliable for your downstream consumers. +6. **Create a `dependencies.yml` file** ([docs](/docs/collaborate/govern/project-dependencies)) for the downstream project, declaring the upstream project as a dependency. + +```yml + +# in dependencies.yml +projects: + - name: jaffle_shop +``` + +### Best practices + +- When you’ve **confirmed the right groups**, it's time to split your projects. + - **Do *one* group at a time**! + - **Do *not* refactor as you migrate**, however tempting that may be. Focus on getting 1-to-1 parity and log any issues you find in doing the migration for later. Once you’ve fully migrated the project then you can start optimizing it for its new life as part of your mesh. +- Start by splitting your project within the same repository for full git tracking and easy reversion if you need to start from scratch. 
+ + +## Connecting existing projects + +Some organizations may already be coordinating across multiple dbt projects. Most often this is via: + +1. Installing parent projects as dbt packages +2. Using `{{ source() }}` functions to read the outputs of a parent project as inputs to a child project. + +This has a few drawbacks: + +1. If using packages, each project has to include *all* resources from *all* projects in its manifest, slowing down dbt and the development cycle. +2. If using sources, there are breakages in the lineage, as there's no real connection between the parent and child projects. + +The migration steps here are much simpler than splitting up a monolith! + +1. If using the `package` method: + 1. In the parent project: + 1. mark all models being referenced downstream as `public` and add a model contract. + 2. In the child project: + 1. Remove the package entry from `packages.yml` + 2. Add the upstream project to your `dependencies.yml` + 3. Update the `{{ ref() }}` functions to models from the upstream project to include the project name argument. +1. If using `source` method: + 1. In the parent project: + 1. mark all models being imported downstream as `public` and add a model contract. + 2. In the child project: + 1. Add the upstream project to your `dependencies.yml` + 2. Replace the `{{ source() }}` functions with cross project `{{ ref() }}` functions. + 3. Remove the unnecessary `source` definitions. + +## Additional Resources +### Our example projects + +We've provided a set of example projects you can use to explore the topics covered here. We've split our [Jaffle Shop](https://github.com/dbt-labs/jaffle-shop) project into 3 separate projects in a multi-repo dbt Mesh. Note that you'll need to leverage dbt Cloud to use multi-project architecture, as cross-project references are powered via dbt Cloud's APIs. + +- **[Platform](https://github.com/dbt-labs/jaffle-shop-mesh-platform)** - containing our centralized staging models. 
+- **[Marketing](https://github.com/dbt-labs/jaffle-shop-mesh-marketing)** - containing our marketing marts. +- **[Finance](https://github.com/dbt-labs/jaffle-shop-mesh-finance)** - containing our finance marts. + +### dbt-meshify + +We recommend using the `dbt-meshify` [command line tool](https://dbt-labs.github.io/dbt-meshify/) to help you do this. This comes with CLI operations to automate most of the above steps. diff --git a/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md b/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md index 8a5aaa998be..d1e78231e57 100644 --- a/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md +++ b/website/docs/guides/best-practices/how-we-structure/1-guide-overview.md @@ -1,6 +1,9 @@ --- -title: How we structure our dbt projects +title: "How we structure our dbt projects" id: 1-guide-overview +description: Learn how we structure our dbt projects. +displayText: How we structure our dbt projects +hoverSnippet: Learn how we structure our dbt projects. --- ## Why does structure matter? @@ -9,11 +12,11 @@ Analytics engineering, at its core, is about helping groups of human beings coll Building a great dbt project is an inherently collaborative endeavor, bringing together domain knowledge from every department to map the goals and narratives of the entire company. As such, it's especially important to establish a deep and broad set of patterns to ensure as many people as possible are empowered to leverage their particular expertise in a positive way, and to ensure that the project remains approachable and maintainable as your organization scales. 
-Famously, Steve Jobs [wore the same outfit everyday](https://images.squarespace-cdn.com/content/v1/5453c539e4b02ab5398ffc8f/1580381503218-E56FQDNFL1P4OBLQWHWW/ke17ZwdGBToddI8pDm48kJKedFpub2aPqa33K4gNUDwUqsxRUqqbr1mOJYKfIPR7LoDQ9mXPOjoJoqy81S2I8N_N4V1vUb5AoIIIbLZhVYxCRW4BPu10St3TBAUQYVKcxb5ZTIyC_D49_DDQq2Sj8YVGtM7O1i4h5tvKa2lazN4nGUQWMS_WcPM-ztWbVr-c/steve_jobs_outfit.jpg) to reduce decision fatigue. You can think of this guide similarly, as a black turtleneck and New Balance sneakers for your company’s dbt project. A dbt project’s power outfit, or more accurately its structure, is composed not of fabric but of files, folders, naming conventions, and programming patterns. How you label things, group them, split them up, or bring them together — the system you use to organize the data transformations encoded in your dbt project — this is your project’s structure. +Famously, Steve Jobs [wore the same outfit everyday](https://images.squarespace-cdn.com/content/v1/5453c539e4b02ab5398ffc8f/1580381503218-E56FQDNFL1P4OBLQWHWW/ke17ZwdGBToddI8pDm48kJKedFpub2aPqa33K4gNUDwUqsxRUqqbr1mOJYKfIPR7LoDQ9mXPOjoJoqy81S2I8N_N4V1vUb5AoIIIbLZhVYxCRW4BPu10St3TBAUQYVKcxb5ZTIyC_D49_DDQq2Sj8YVGtM7O1i4h5tvKa2lazN4nGUQWMS_WcPM-ztWbVr-c/steve_jobs_outfit.jpg) to reduce decision fatigue. You can think of this guide similarly, as a black turtleneck and New Balance sneakers for your company’s dbt project. A dbt project’s power outfit, or more accurately its structure, is composed not of fabric but of files, folders, naming conventions, and programming patterns. How you label things, group them, split them up, or bring them together — the system you use to organize the [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) encoded in your dbt project — this is your project’s structure. -This guide is just a starting point. You may decide that you prefer Birkenstocks or a purple hoodie for your project over Jobs-ian minimalism. That's fine. 
What's important is that you think through the reasoning for those changes in your organization, explicitly declare them in a thorough, accessible way for all contributors, and above all *stay consistent*. +This guide is just a starting point. You may decide that you prefer Birkenstocks or a purple hoodie for your project over Jobs-ian minimalism. That's fine. What's important is that you think through the reasoning for those changes in your organization, explicitly declare them in a thorough, accessible way for all contributors, and above all _stay consistent_. -One foundational principle that applies to all dbt projects though, is the need to establish a cohesive arc moving data from *source-conformed* to *business-conformed*. Source-conformed data is shaped by external systems out of our control, while business-conformed data is shaped by the needs, concepts, and definitions we create. No matter what patterns or conventions you define within your project, this process remains the essential purpose of the transformation layer, and dbt as your tool within it. This guide is an update to a seminal analytics engineering [post of the same name](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355) by the great Claire Carroll, and while some of the details have changed over time (as anticipated in that post) this fundamental trajectory holds true. Moving forward, this guide will be iteratively updated as new tools expand our viewpoints, new experiences sharpen our vision, and new voices strengthen our perspectives, but always in service of that aim. +One foundational principle that applies to all dbt projects though, is the need to establish a cohesive arc moving data from _source-conformed_ to _business-conformed_. Source-conformed data is shaped by external systems out of our control, while business-conformed data is shaped by the needs, concepts, and definitions we create. 
No matter what patterns or conventions you define within your project, this process remains the essential purpose of the transformation layer, and dbt as your tool within it. This guide is an update to a seminal analytics engineering [post of the same name](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355) by the great Claire Carroll, and while some of the details have changed over time (as anticipated in that post) this fundamental trajectory holds true. Moving forward, this guide will be iteratively updated as new tools expand our viewpoints, new experiences sharpen our vision, and new voices strengthen our perspectives, but always in service of that aim. ### Learning goals @@ -21,7 +24,7 @@ This guide has three main goals: - Thoroughly cover our most up-to-date recommendations on how to structure typical dbt projects - Illustrate these recommendations with comprehensive examples -- At each stage, explain *why* we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs +- At each stage, explain _why_ we recommend the approach that we do, so that you're equipped to decide when and where to deviate from these recommendations to better fit your organization’s unique needs You should walk away from this guide with a deeper mental model of how the components of a dbt project fit together, such that purpose and principles of analytics engineering feel more clear and intuitive. @@ -30,7 +33,7 @@ By approaching our structure intentionally, we’ll gain a better understanding Our hope is that by deepening your sense of the connections between these patterns and the principles they flow from, you'll be able to translate them to fit your specific needs and craft customized documentation for your team to act on. :::info Example project. 
-This guide walks through our recommendations using a very simple dbt project — similar to the one used for the Getting Started guide and many other demos — from a fictional company called the Jaffle Shop. You can read more about [jaffles](https://en.wiktionary.org/wiki/jaffle) if you want (they *are* a real thing), but that context isn’t important to understand the structure. We encourage you to follow along, try things out, make changes, and take notes on what works or doesn't work for you along the way. +This guide walks through our recommendations using a very simple dbt project — similar to the one used for the Getting Started guide and many other demos — from a fictional company called the Jaffle Shop. You can read more about [jaffles](https://en.wiktionary.org/wiki/jaffle) if you want (they _are_ a real thing), but that context isn’t important to understand the structure. We encourage you to follow along, try things out, make changes, and take notes on what works or doesn't work for you along the way. ::: We'll get a deeper sense of our project as we move through the guide, but for now we just need to know that the Jaffle Shop is a restaurant selling jaffles that has two main data sources: @@ -43,21 +46,21 @@ We'll get a deeper sense of our project as we move through the guide, but for no We'll walk through our topics in the same order that our data would move through transformation: 1. Dig into how we structure the files, folders, and models for our three primary layers in the `models` directory, which build on each other: - 1. **Staging** — creating our atoms, our initial modular building blocks, from source data - 2. **Intermediate** — stacking layers of logic with clear and specific purposes to prepare our staging models to join into the entities we want - 3. **Marts** — bringing together our modular pieces into a wide, rich vision of the entities our organization cares about + 1. 
**Staging** — creating our atoms, our initial modular building blocks, from source data + 2. **Intermediate** — stacking layers of logic with clear and specific purposes to prepare our staging models to join into the entities we want + 3. **Marts** — bringing together our modular pieces into a wide, rich vision of the entities our organization cares about 2. Explore how these layers fit into the rest of the project: - 1. Review the overall structure comprehensively - 2. Expand on YAML configuration in-depth - 3. Discuss how to use the other folders in a dbt project: `tests`, `seeds`, and `analysis` + 1. Review the overall structure comprehensively + 2. Expand on YAML configuration in-depth + 3. Discuss how to use the other folders in a dbt project: `tests`, `seeds`, and `analyses` Below is the complete file tree of the project we’ll be working through. Don’t worry if this looks like a lot of information to take in at once - this is just to give you the full vision of what we’re building towards. We’ll focus in on each of the sections one by one as we break down the project’s structure. -```markdown +```shell jaffle_shop ├── README.md -├── analysis -├── data +├── analyses +├── seeds │ └── employees.csv ├── dbt_project.yml ├── macros diff --git a/website/docs/guides/best-practices/how-we-structure/2-staging.md b/website/docs/guides/best-practices/how-we-structure/2-staging.md index 225d1a3caf6..bcb589508e5 100644 --- a/website/docs/guides/best-practices/how-we-structure/2-staging.md +++ b/website/docs/guides/best-practices/how-we-structure/2-staging.md @@ -1,6 +1,9 @@ --- title: "Staging: Preparing our atomic building blocks" id: 2-staging +description: Preparing our atomic building blocks. +displayText: Preparing our atomic building blocks. +hoverSnippet: Preparing our atomic building blocks. --- The staging layer is where our journey begins. 
This is the foundation of our project, where we bring all the individual components we're going to use to build our more complex and useful models into the project. @@ -11,7 +14,7 @@ We'll use an analogy for working with dbt throughout this guide: thinking modula Let's zoom into the staging directory from our `models` file tree [in the overview](/guides/best-practices/how-we-structure/1-guide-overview) and walk through what's going on here. -```markdown +```shell models/staging ├── jaffle_shop │ ├── _jaffle_shop__docs.md @@ -28,12 +31,12 @@ models/staging └── stg_stripe__payments.sql ``` -- **Folders.** Folder structure is extremely important in dbt. Not only do we need a consistent structure to find our way around the codebase, as with any software project, but our folder structure is also one of the key interfaces into understanding the knowledge graph encoded in our project (alongside the DAG and the data output into our warehouse). It should reflect how the data flows, step-by-step, from a wide variety of source-conformed models into fewer, richer business-conformed models. Moreover, we can use our folder structure as a means of selection in dbt [selector syntax](https://docs.getdbt.com/reference/node-selection/syntax). For example, with the above structure, if we got fresh Stripe data loaded and wanted to run all the models that build on our Stripe data, we can easily run `dbt build --select staging.stripe+` and we’re all set building more up-to-date reports on payments. +- **Folders.** Folder structure is extremely important in dbt. Not only do we need a consistent structure to find our way around the codebase, as with any software project, but our folder structure is also one of the key interfaces for understanding the knowledge graph encoded in our project (alongside the DAG and the data output into our warehouse). It should reflect how the data flows, step-by-step, from a wide variety of source-conformed models into fewer, richer business-conformed models. 
Moreover, we can use our folder structure as a means of selection in dbt [selector syntax](https://docs.getdbt.com/reference/node-selection/syntax). For example, with the above structure, if we got fresh Stripe data loaded and wanted to run all the models that build on our Stripe data, we can easily run `dbt build --select staging.stripe+` and we’re all set for building more up-to-date reports on payments. - ✅ **Subdirectories based on the source system**. Our internal transactional database is one system, the data we get from Stripe's API is another, and lastly the events from our Snowplow instrumentation. We've found this to be the best grouping for most companies, as source systems tend to share similar loading methods and properties between tables, and this allows us to operate on those similar sets easily. - ❌ **Subdirectories based on loader.** Some people attempt to group by how the data is loaded (Fivetran, Stitch, custom syncs), but this is too broad to be useful on a project of any real size. - - ❌ **Subdirectories based on business grouping.** Another approach we recommend against is splitting up by business groupings in the staging layer, and creating subdirectories like 'marketing', 'finance', etc. A key goal of any great dbt project should be establishing a single source of truth. By breaking things up too early, we open ourselves up to create overlap and conflicting definitions (think marketing and financing having different fundamental tables for orders). We want everybody to be building with the same set of atoms, so in our experience, starting our transformations with our staging structure reflecting the source system structures is the best level of grouping for this step. + - ❌ **Subdirectories based on business grouping.** Another approach we recommend against is splitting up by business groupings in the staging layer, and creating subdirectories like 'marketing', 'finance', etc. 
A key goal of any great dbt project should be establishing a single source of truth. By breaking things up too early, we open ourselves up to creating overlap and conflicting definitions (think marketing and finance having different fundamental tables for orders). We want everybody to be building with the same set of atoms, so in our experience, starting our transformations with our staging structure reflecting the source system structures is the best level of grouping for this step. - **File names.** Creating a consistent pattern of file naming is [crucial in dbt](https://docs.getdbt.com/blog/on-the-importance-of-naming). File names must be unique and correspond to the name of the model when selected and created in the warehouse. We recommend putting as much clear information into the file name as possible, including a prefix for the layer the model exists in, important grouping information, and specific information about the entity or transformation in the model. - - ✅ `stg_[source]__[entity]s.sql` - the double underscore between source system and entity helps visually distinguish the separate parts in the case of a source name having multiple words. For instance, `google_analytics__campaigns` is always understandable, whereas to somebody unfamiliar `google_analytics_campaigns` could be `analytics_campaigns` from the `google` source system as easily as `campaigns` from the `google_analytics` source system. Think of it like an [oxford comma](https://www.youtube.com/watch?v=P_i1xk07o4g), the extra clarity is very much worth the extra punctuation. + - ✅ `stg_[source]__[entity]s.sql` - the double underscore between source system and entity helps visually distinguish the separate parts in the case of a source name having multiple words. 
For instance, `google_analytics__campaigns` is always understandable, whereas to somebody unfamiliar `google_analytics_campaigns` could be `analytics_campaigns` from the `google` source system as easily as `campaigns` from the `google_analytics` source system. Think of it like an [Oxford comma](https://www.youtube.com/watch?v=P_i1xk07o4g), the extra clarity is very much worth the extra punctuation. - ❌ `stg_[entity].sql` - might be specific enough at first, but will break down in time. Adding the source system into the file name aids in discoverability, and allows understanding where a component model came from even if you aren't looking at the file tree. - ✅ **Plural.** SQL, and particularly SQL in dbt, should read as much like prose as we can achieve. We want to lean into the broad clarity and declarative nature of SQL when possible. As such, unless there’s a single order in your `orders` table, plural is the correct way to describe what is in a table with multiple rows. @@ -41,7 +44,7 @@ models/staging Now that we’ve got a feel for how the files and folders fit together, let’s look inside one of these files and dig into what makes for a well-structured staging model. -Below, is an example of a standard staging model (from our `stg_stripe__payments` model) that illustrates the common patterns within the staging layer. We’ve organized our model into two CTEs: one pulling in a source table via the [source macro](https://docs.getdbt.com/docs/building-a-dbt-project/using-sources#selecting-from-a-source) and the other applying our transformations. +Below is an example of a standard staging model (from our `stg_stripe__payments` model) that illustrates the common patterns within the staging layer. We’ve organized our model into two CTEs: one pulling in a source table via the [source macro](https://docs.getdbt.com/docs/build/sources#selecting-from-a-source) and the other applying our transformations. 
While our later layers of transformation will vary greatly from model to model, every one of our staging models will follow this exact same pattern. As such, we need to make sure the pattern we’ve established is rock solid and consistent. @@ -74,7 +77,7 @@ renamed as ( -- numerics amount as amount_cents, amount / 100.0 as amount, - + -- booleans case when status = 'successful' then true @@ -99,22 +102,23 @@ select * from renamed - ✅ **Type casting** - ✅ **Basic computations** (e.g. cents to dollars) - ✅ **Categorizing** (using conditional logic to group values into buckets or booleans, such as in the `case when` statements above) - - ❌ **Joins** — the goal of staging models is to clean and prepare individual source conformed concepts for downstream usage. We're creating the most useful version of a source system table, which we can use as a new modular component for our project. In our experience, joins are almost always a bad idea here — they create immediate duplicated computation and confusing relationships that ripple downstream — there are occasionally exceptions though (see [base models](guides/best-practices/how-we-structure/2-staging#staging-other-considerations) below). + - ❌ **Joins** — the goal of staging models is to clean and prepare individual source-conformed concepts for downstream usage. We're creating the most useful version of a source system table, which we can use as a new modular component for our project. In our experience, joins are almost always a bad idea here — they create immediate duplicated computation and confusing relationships that ripple downstream — there are occasionally exceptions though (refer to [base models](#staging-other-considerations) for more info). - ❌ **Aggregations** — aggregations entail grouping, and we're not doing that at this stage. 
Remember - staging models are your place to create the building blocks you’ll use all throughout the rest of your project — if we start changing the grain of our tables by grouping in this layer, we’ll lose access to source data that we’ll likely need at some point. We just want to get our individual concepts cleaned and ready for use, and will handle aggregating values downstream. -- ✅ **Materialized as views.** Looking at a partial view of our `dbt_project.yml` below, we can see that we’ve configured the entire staging directory to be materialized as views. As they’re not intended to be final artifacts themselves, but rather building blocks for later models, staging models should typically be materialized as views for two key reasons: +- ✅ **Materialized as views.** Looking at a partial view of our `dbt_project.yml` below, we can see that we’ve configured the entire staging directory to be materialized as views. As they’re not intended to be final artifacts themselves, but rather building blocks for later models, staging models should typically be materialized as views for two key reasons: + - Any downstream model (discussed more in [marts](/guides/best-practices/how-we-structure/4-marts)) referencing our staging models will always get the freshest data possible from all of the component views it’s pulling together and materializing - It avoids wasting space in the warehouse on models that are not intended to be queried by data consumers, and thus do not need to perform as quickly or efficiently ```yaml # dbt_project.yml - + models: jaffle_shop: staging: +materialized: view ``` -- Staging models are the only place we'll use the [`source` macro](/docs/build/sources), and our staging models should have a 1-to-1 relationship to our source tables. That means for each source system table we’ll have a single staging model referencing it, acting as its entry point — *staging* it — for use downstream. 
+- Staging models are the only place we'll use the [`source` macro](/docs/build/sources), and our staging models should have a 1-to-1 relationship to our source tables. That means for each source system table we’ll have a single staging model referencing it, acting as its entry point — _staging_ it — for use downstream. :::tip Don’t Repeat Yourself. Staging models help us keep our code DRY. dbt's modular, reusable structure means we can, and should, push any transformations that we’ll always want to use for a given component model as far upstream as possible. This saves us from potentially wasting code, complexity, and compute doing the same transformation more than once. For instance, if we know we always want our monetary values as floats in dollars, but the source system is integers and cents, we want to do the division and type casting as early as possible so that we can reference it rather than redo it repeatedly downstream. @@ -125,94 +129,96 @@ This is a welcome change for many of us who have become used to applying the sam ### Staging: Other considerations - **Base models when joins are necessary to stage concepts.** Sometimes, in order to maintain a clean and DRY staging layer we do need to implement some joins to create a solid concept for our building blocks. In these cases, we recommend creating a sub-directory in the staging directory for the source system in question and building `base` models. These have all the same properties that would normally be in the staging layer, they will directly source the raw data and do the non-joining transformations, then in the staging models we’ll join the requisite base models. The most common use cases for building a base layer under a staging folder are: - - ✅ **Joining in separate delete tables**. Sometime a source system might store deletes in a separate table. 
Typically we’ll want to make sure we can mark or filter out deleted records for all our component models, so we’ll need to join these delete records up to any of our entities that follow this pattern. This is the example shown below to illustrate. + + - ✅ **Joining in separate delete tables**. Sometimes a source system might store deletes in a separate table. Typically we’ll want to make sure we can mark or filter out deleted records for all our component models, so we’ll need to join these delete records up to any of our entities that follow this pattern. This is the example shown below to illustrate. ```sql -- base_jaffle_shop__customers.sql - + with - + source as ( - + select * from {{ source('jaffle_shop','customers') }} - + ), - + customers as ( - + select id as customer_id, first_name, last_name - + from source - + ) - + select * from customers ``` ```sql -- base_jaffle_shop__deleted_customers.sql - + with - + source as ( - + select * from {{ source('jaffle_shop','customer_deletes') }} - + ), - + deleted_customers as ( - + select id as customer_id, deleted as deleted_at - + from source - + ) - + select * from deleted_customers ``` ```sql -- stg_jaffle_shop__customers.sql - + with - + customers as ( - + select * from {{ ref('base_jaffle_shop__customers') }} - + ), - + deleted_customers as ( - + select * from {{ ref('base_jaffle_shop__deleted_customers') }} - + ), - + join_and_mark_deleted_customers as ( - + select customers.*, case when deleted_customers.deleted_at is not null then true else false end as is_deleted - + from customers - + left join deleted_customers on customers.customer_id = deleted_customers.customer_id - + ) - + select * from join_and_mark_deleted_customers ``` - - ✅ **Unioning disparate but symmetrical sources**. A typical example here would be if you operate multiple ecommerce platforms in various territories via a SaaS platform like Shopify. You would have perfectly identical schemas, but all loaded separately into your warehouse. 
In this case, it’s easier to reason about our orders if *all* of our shops are unioned together, so we’d want to handle the unioning in a base model before we carry on with our usual staging model transformations on the (now complete) set — you can dig into [more detail on this use case here](https://discourse.getdbt.com/t/unioning-identically-structured-data-sources/921). -- **[Codegen](https://github.com/dbt-labs/dbt-codegen) to automate staging table generation.** It’s very good practice to learn to write staging models by hand, they’re straightforward and numerous, so they can be an excellent way to absorb the dbt style of writing SQL. Also, we’ll invariably find ourselves needing to add special elements to specific models at times — for instance, in one of the situations above that require base models — so it’s helpful to deeply understand how they work. Once that understanding is established though, because staging models are built largely following the same rote patterns and need to be built 1-to-1 for each source table in a source system, it’s preferable to start automating their creation. For this, we have the [codegen](https://github.com/dbt-labs/dbt-codegen) package. This will let you automatically generate all the source YAML and staging model boilerplate to speed up this step, and we recommend using it in every project. + - ✅ **Unioning disparate but symmetrical sources**. A typical example here would be if you operate multiple ecommerce platforms in various territories via a SaaS platform like Shopify. You would have perfectly identical schemas, but all loaded separately into your warehouse. 
In this case, it’s easier to reason about our orders if _all_ of our shops are unioned together, so we’d want to handle the unioning in a base model before we carry on with our usual staging model transformations on the (now complete) set — you can dig into [more detail on this use case here](https://discourse.getdbt.com/t/unioning-identically-structured-data-sources/921). + +- **[Codegen](https://github.com/dbt-labs/dbt-codegen) to automate staging table generation.** It’s very good practice to learn to write staging models by hand, they’re straightforward and numerous, so they can be an excellent way to absorb the dbt style of writing SQL. Also, we’ll invariably find ourselves needing to add special elements to specific models at times — for instance, in one of the situations above that require base models — so it’s helpful to deeply understand how they work. Once that understanding is established though, because staging models are built largely following the same rote patterns and need to be built 1-to-1 for each source table in a source system, it’s preferable to start automating their creation. For this, we have the [codegen](https://github.com/dbt-labs/dbt-codegen) package. This will let you automatically generate all the source YAML and staging model boilerplate to speed up this step, and we recommend using it in every project. - **Utilities folder.** While this is not in the `staging` folder, it’s useful to consider as part of our fundamental building blocks. The `models/utilities` directory is where we can keep any general purpose models that we generate from macros or based on seeds that provide tools to help us do our modeling, rather than data to model itself. The most common use case is a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source) generated with [the dbt utils package](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/). :::info Development flow versus DAG order. 
diff --git a/website/docs/guides/best-practices/how-we-structure/3-intermediate.md b/website/docs/guides/best-practices/how-we-structure/3-intermediate.md index a98881fa969..0cf44d3cccc 100644 --- a/website/docs/guides/best-practices/how-we-structure/3-intermediate.md +++ b/website/docs/guides/best-practices/how-we-structure/3-intermediate.md @@ -1,6 +1,9 @@ --- title: "Intermediate: Purpose-built transformation steps" -id: 3-intermediate +id: "3-intermediate" +description: Purpose-built transformation steps. +displayText: Purpose-built transformation steps. +hoverSnippet: Purpose-built transformation steps. --- Once we’ve got our atoms ready to work with, we’ll set about bringing them together into more intricate, connected molecular shapes. The intermediate layer is where these molecules live, creating varied forms with specific purposes on the way towards the more complex proteins and cells we’ll use to breathe life into our data products. @@ -9,7 +12,7 @@ Once we’ve got our atoms ready to work with, we’ll set about bringing them t Let’s take a look at the intermediate layer of our project to understand the purpose of this stage more concretely. -```markdown +```shell models/intermediate └── finance ├── _int_finance__models.yml @@ -17,43 +20,43 @@ models/intermediate ``` - **Folders** - - ✅ **Subdirectories based on business groupings.** Much like the staging layer, we’ll house this layer of models inside their own `intermediate` subfolder. Unlike the staging layer, here we shift towards being business-conformed, splitting our models up into subdirectories not by their source system, but their area of business concern. + - ✅ **Subdirectories based on business groupings.** Much like the staging layer, we’ll house this layer of models inside their own `intermediate` subfolder. Unlike the staging layer, here we shift towards being business-conformed, splitting our models up into subdirectories not by their source system, but by their area of business concern. 
- **File names** - - `✅ int_[entity]s_[verb]s.sql` - the variety of transformations that can happen inside of the intermediate layer make it harder to dictate strictly how to name them. The best guiding principle is to think about *verbs* (e.g. `pivoted`, `aggregated_to_user`, `joined`, `fanned_out_by_quantity`, `funnel_created`, etc.) in the intermediate layer. In our example project, we use an intermediate model to pivot payments out to the order grain, so we name our model `int_payments_pivoted_to_orders`. It’s easy for anybody to quickly understand what’s happening in that model, even if they don’t know SQL. That clarity is worth the long file name. It’s important to note that we’ve dropped the double underscores at this layer. In moving towards business conformed concepts, we no longer need to separate a system and an entity and simply reference the unified entity if possible. In cases where you need intermediate models to operate at the source system level (e.g. `int_shopify__orders_summed`, `int_core__orders_summed` which you would later union), you’d preserve the double underscores. Some people like to separate the entity and verbs with double underscores as well. That’s a matter of preference, but in our experience there are often intrinsic connection between entities and verbs in this layer that make that difficult to maintain. + - `✅ int_[entity]s_[verb]s.sql` - the variety of transformations that can happen inside of the intermediate layer makes it harder to dictate strictly how to name them. The best guiding principle is to think about _verbs_ (e.g. `pivoted`, `aggregated_to_user`, `joined`, `fanned_out_by_quantity`, `funnel_created`, etc.) in the intermediate layer. In our example project, we use an intermediate model to pivot payments out to the order grain, so we name our model `int_payments_pivoted_to_orders`. It’s easy for anybody to quickly understand what’s happening in that model, even if they don’t know [SQL](https://mode.com/sql-tutorial/). 
That clarity is worth the long file name. It’s important to note that we’ve dropped the double underscores at this layer. In moving towards business-conformed concepts, we no longer need to separate a system and an entity and simply reference the unified entity if possible. In cases where you need intermediate models to operate at the source system level (e.g. `int_shopify__orders_summed`, `int_core__orders_summed` which you would later union), you’d preserve the double underscores. Some people like to separate the entity and verbs with double underscores as well. That’s a matter of preference, but in our experience, there is often an intrinsic connection between entities and verbs in this layer that makes that difficult to maintain. :::tip Don’t over-optimize too early! -The example project is very simple for illustrative purposes. This level of division in our post-staging layers is probably unnecessary when dealing with this few models. Remember, our goal is a *single* *source of truth.* We don’t want finance and marketing operating on separate `orders` models, we want to use our dbt project as a means to bring those definitions together! As such, don’t split and optimize too early. If you have less than 10 marts models and aren’t having problems developing and using them, feel free to forego subdirectories completely (except in the staging layer, where you should always implement them as you add new source systems to your project) until the project has grown to really need them. Using dbt is always about bringing simplicity to complexity. +The example project is very simple for illustrative purposes. This level of division in our post-staging layers is probably unnecessary when dealing with so few models. Remember, our goal is a _single_ _source of truth._ We don’t want finance and marketing operating on separate `orders` models, we want to use our dbt project as a means to bring those definitions together! As such, don’t split and optimize too early. 
If you have fewer than 10 marts models and aren’t having problems developing and using them, feel free to forego subdirectories completely (except in the staging layer, where you should always implement them as you add new source systems to your project) until the project has grown to really need them. Using dbt is always about bringing simplicity to complexity. ::: ### Intermediate: Models -Below is the lone intermediate model from our small example project. This represents an excellent use case per our principles above, serving a clear single purpose: grouping and pivoting a staging model to different grain. It utilizes a bit of Jinja to make the model DRY-er (striving to be DRY applies to the code we write inside a single model in addition to transformations across the codebase), but don’t be intimidated if you’re not quite comfortable with [Jinja](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros) yet. Looking at the name of the CTE, `pivot_and_aggregate_payments_to_order_grain` we get a very clear idea of what’s happening inside this block. By descriptively labeling the transformations happening inside our CTEs within model, just as we do with our files and folders, even a stakeholder who doesn’t know SQL would be able to grasp the purpose of this section, if not the code. As you begin to write more complex transformations moving out of the staging layer, keep this idea in mind. In the same way our models connect into a DAG and tell the story of our transformations on a macro scale, CTEs can do this on a smaller scale inside our model files. +Below is the lone intermediate model from our small example project. This represents an excellent use case per our principles above, serving a clear single purpose: grouping and pivoting a staging model to a different grain. 
It utilizes a bit of Jinja to make the model DRY-er (striving to be DRY applies to the code we write inside a single model in addition to transformations across the codebase), but don’t be intimidated if you’re not quite comfortable with [Jinja](/docs/build/jinja-macros) yet. Looking at the name of the CTE, `pivot_and_aggregate_payments_to_order_grain` we get a very clear idea of what’s happening inside this block. By descriptively labeling the transformations happening inside our CTEs within model, just as we do with our files and folders, even a stakeholder who doesn’t know SQL would be able to grasp the purpose of this section, if not the code. As you begin to write more complex transformations moving out of the staging layer, keep this idea in mind. In the same way our models connect into a DAG and tell the story of our transformations on a macro scale, CTEs can do this on a smaller scale inside our model files. ```sql -- int_payments_pivoted_to_orders.sql {%- set payment_methods = ['bank_transfer','credit_card','coupon','gift_card'] -%} - -with + +with payments as ( select * from {{ ref('stg_stripe__payments') }} ), - + pivot_and_aggregate_payments_to_order_grain as ( - + select - order_id, + order_id, {% for payment_method in payment_methods -%} - + sum( case when payment_method = '{{ payment_method }}' and - status = 'success' - then amount - else 0 + status = 'success' + then amount + else 0 end ) as {{ payment_method }}_amount, @@ -65,7 +68,7 @@ pivot_and_aggregate_payments_to_order_grain as ( group by 1 ) - + select * from pivot_and_aggregate_payments_to_order_grain ``` @@ -74,26 +77,15 @@ select * from pivot_and_aggregate_payments_to_order_grain - ✅ **Materialized as views in a custom schema with special permissions.** A more robust option is to materialize your intermediate models as views in a specific [custom schema](/docs/build/custom-schemas), outside of your main production schema. 
This gives you added insight into development and easier troubleshooting as the number and complexity of your models grows, while remaining easy to implement and taking up negligible space. :::tip Keep your warehouse tidy! -There are three interfaces to the organizational knowledge graph we’re encoding into dbt: the DAG, the files and folder structure of our codebase, and the output into the warehouse. As such, it’s really important that we consider that output intentionally! Think of the schemas, tables, and views we’re creating in the warehouse as *part of the UX,* in addition to the dashboards, ML, apps, and other use cases you may be targeting for the data. Ensuring that our output is named and grouped well, and that models not intended for broad use are either not materialized or built into special areas with specific permissions is crucial to achieve this. +There are three interfaces to the organizational knowledge graph we’re encoding into dbt: the DAG, the files and folder structure of our codebase, and the output into the warehouse. As such, it’s really important that we consider that output intentionally! Think of the schemas, tables, and views we’re creating in the warehouse as _part of the UX,_ in addition to the dashboards, ML, apps, and other use cases you may be targeting for the data. Ensuring that our output is named and grouped well, and that models not intended for broad use are either not materialized or built into special areas with specific permissions is crucial to achieving this. ::: -- Intermediate models’ purposes, as these serve to break up complexity from our marts models, can take as many forms as data transformation might require. Some of the most common use cases of intermediate models include: - +- Intermediate models’ purposes, as these serve to break up complexity from our marts models, can take as many forms as [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) might require. 
Some of the most common use cases of intermediate models include: + - ✅ **Structural simplification.** Bringing together a reasonable number (typically 4 to 6) of entities or concepts (staging models, or perhaps other intermediate models) that will be joined with another similarly purposed intermediate model to generate a mart — rather than have 10 joins in our mart, we can join two intermediate models that each house a piece of the complexity, giving us increased readability, flexibility, testing surface area, and insight into our components. - - ✅ **Re-graining.** Intermediate models are often used to fan out or collapse models to the right composite grain — if we’re building a mart for `order_items` that requires us to fan out our `orders` based on the `quantity` column, creating a new single row for each item, this would be ideal to do in a specific intermediate model to maintain clarity in our mart and more easily view that our grain is correct before we mix it with other components + - ✅ **Re-graining.** Intermediate models are often used to fan out or collapse models to the right composite grain — if we’re building a mart for `order_items` that requires us to fan out our `orders` based on the `quantity` column, creating a new single row for each item, this would be ideal to do in a specific intermediate model to maintain clarity in our mart and more easily view that our grain is correct before we mix it with other components. - ✅ **Isolating complex operations.** It’s helpful to move any particularly complex or difficult to understand pieces of logic into their own intermediate models. This not only makes them easier to refine and troubleshoot, but simplifies later models that can reference this concept in a more clearly readable way. 
For example, in the `quantity` fan out example above, we benefit by isolating this complex piece of logic so we can quickly debug and thoroughly test that transformation, and downstream models can reference `order_items` in a way that’s intuitively easy to grasp. - - ❌ **Used repeatedly in multiple models.** If we’re referencing the same intermediate model in several places, it probably should be a macro, or we should reconsider how and where we’re bringing our models together. Remember our DRY principle: while it’s good that we’ve modularized our logic, bringing the same intermediate model into several downstream models usually indicates that we’re creating duplication in our output too early and recomputing the same transformations. Ideally, it’s in the *marts* layer that we’ll start bringing concepts together repeatedly to form a variety of outputs that often have similar data. Be wary of doing this in the intermediate layer, as it can lead to your DAG becoming confusing — generally an indicator that the mental models of our project flow are getting muddied. Additionally, it’s usually more efficient to join marts as they have been materialized into the warehouse, meaning we don’t have to recompute the entire chain of transformations in addition to the computing the joins. - - ![Multiple *outputs* from an intermediate model, bringing the same model into several different marts, is typically a red flag in our DAG that we’re computing the same transformations multiple times.](/img/guides/best-practices/how-we-structure/widening-dag.png) - - Multiple *outputs* from an intermediate model, bringing the same model into several different marts, is typically a red flag in our DAG that we’re computing the same transformations multiple times. - - - ✅ **Used in one model.** Bringing our intermediate models, staging models, and other marts together thoughtfully so our models can build on each other efficiently. - - ![Multiple *inputs* to a model is generally preferable. 
Several inputs and one output at each node indicates we’re bringing our concepts together into richer, wider entities; forming a healthy DAG shaped like an arrowhead pointing to the right.](/img/guides/best-practices/how-we-structure/narrowing-dag.png) - - Multiple *inputs* to a model is generally preferable. Several inputs and one output at each node indicates we’re bringing our concepts together into richer, wider entities; forming a healthy DAG shaped like an arrowhead pointing to the right. :::tip Narrow the DAG, widen the tables. -Until we get to the marts layer and start building our various outputs, we ideally want our DAG to look like an arrowhead pointed right. As we move from source-conformed to business-conformed, we’re also moving from numerous, narrow, isolated concepts to fewer, wider, joined concepts. We’re bringing our components together into wider, richer concepts, and that creates this shape in our DAG. This way when we get to the marts layer we have a robust set of components that can quickly and easily be put into any configuration to answer a variety of questions and serve specific needs. One rule of thumb to ensure you’re following this pattern on an individual model level is allowing multiple *inputs* to a model, but **not** multiple *outputs*. Several arrows going *into* our post-staging models is great and expected, several arrows coming *out* is a red flag. There are absolutely situations where you need to break this rule, but it’s something to be aware of, careful about, and avoid when possible. +Until we get to the marts layer and start building our various outputs, we ideally want our DAG to look like an arrowhead pointed right. As we move from source-conformed to business-conformed, we’re also moving from numerous, narrow, isolated concepts to fewer, wider, joined concepts. We’re bringing our components together into wider, richer concepts, and that creates this shape in our DAG. 
This way when we get to the marts layer we have a robust set of components that can quickly and easily be put into any configuration to answer a variety of questions and serve specific needs. One rule of thumb to ensure you’re following this pattern on an individual model level is allowing multiple _inputs_ to a model, but **not** multiple _outputs_. Several arrows going _into_ our post-staging models is great and expected, several arrows coming _out_ is a red flag. There are absolutely situations where you need to break this rule, but it’s something to be aware of, careful about, and avoid when possible. ::: diff --git a/website/docs/guides/best-practices/how-we-structure/4-marts.md b/website/docs/guides/best-practices/how-we-structure/4-marts.md index 3cf4bc78209..e7a0d35c342 100644 --- a/website/docs/guides/best-practices/how-we-structure/4-marts.md +++ b/website/docs/guides/best-practices/how-we-structure/4-marts.md @@ -1,15 +1,19 @@ --- title: "Marts: Business-defined entities" -id: 4-marts +id: "4-marts" --- -This is the layer where everything comes together and we start to arrange all of our atoms (staging models) and molecules (intermediate models) into full-fledged cells that have identity and purpose. We sometimes like to call this the *entity* *layer* or *concept layer*, to emphasize that all our marts are meant to represent a specific entity or concept at its unique grain. For instance, an order, a customer, a territory, a click event, a payment — each of these would be represented with a distinct mart, and each row would represent a discrete instance of these concepts. Unlike in a traditional Kimball star schema though, in modern data warehousing — where storage is cheap and compute is expensive — we’ll happily borrow and add any and all data from other concepts that are relevant to answering questions about the mart’s core entity. 
Building the same data in multiple places, as we do with `orders` in our `customers` mart example below, is more efficient in this paradigm than repeatedly rejoining these concepts (this is a basic definition of denormalization in this context). Let’s take a look at how we approach this first layer intended expressly for exposure to end users. +:::info +Our guidance here diverges if you use the dbt Semantic Layer. In a project without the Semantic Layer we recommend you denormalize heavily, per the best practices below. On the other hand, if you're using the Semantic Layer, we want to stay as normalized as possible to allow MetricFlow the most flexibility. Guidance for marts in a Semantic Layer context is on the next page. +::: + +This is the layer where everything comes together and we start to arrange all of our atoms (staging models) and molecules (intermediate models) into full-fledged cells that have identity and purpose. We sometimes like to call this the _entity_ _layer_ or _concept layer_, to emphasize that all our marts are meant to represent a specific entity or concept at its unique grain. For instance, an order, a customer, a territory, a click event, a payment — each of these would be represented with a distinct mart, and each row would represent a discrete instance of these concepts. Unlike in a traditional Kimball star schema though, in modern data warehousing — where storage is cheap and compute is expensive — we’ll happily borrow and add any and all data from other concepts that are relevant to answering questions about the mart’s core entity. Building the same data in multiple places, as we do with `orders` in our `customers` mart example below, is more efficient in this paradigm than repeatedly rejoining these concepts (this is a basic definition of denormalization in this context). Let’s take a look at how we approach this first layer intended expressly for exposure to end users. 
### Marts: Files and folders The last layer of our core transformations is below, providing models for both `finance` and `marketing` departments. -```markdown +```shell models/marts ├── finance │ ├── _finance__models.yml @@ -24,7 +28,7 @@ models/marts ✅ **Name by entity.** Use plain English to name the file based on the concept that forms the grain of the mart `customers`, `orders`. Note that for pure marts, there should not be a time dimension (`orders_per_day`) here, that is typically best captured via metrics. -❌ **Build the same concept differently for different teams.** `finance_orders` and `marketing_orders` is typically considered an anti-pattern. There are, as always, exceptions — a common pattern we see is that, finance may have specific needs, for example reporting revenue to the government in a way that diverges from how the company as a whole measures revenue day-to-day. Just make sure that these are clearly designed and understandable as *separate* concepts, not departmental views on the same concept: `tax_revenue` and `revenue` not `finance_revenue` and `marketing_revenue`. +❌ **Build the same concept differently for different teams.** `finance_orders` and `marketing_orders` is typically considered an anti-pattern. There are, as always, exceptions — a common pattern we see is that, finance may have specific needs, for example reporting revenue to the government in a way that diverges from how the company as a whole measures revenue day-to-day. Just make sure that these are clearly designed and understandable as _separate_ concepts, not departmental views on the same concept: `tax_revenue` and `revenue` not `finance_revenue` and `marketing_revenue`. 
### Marts: Models @@ -33,7 +37,7 @@ Finally we’ll take a look at the best practices for models within the marts di ```sql -- orders.sql -with +with orders as ( @@ -68,7 +72,7 @@ select * from orders_and_payments_joined ```sql -- customers.sql -with +with customers as ( @@ -117,21 +121,15 @@ customers_and_customer_orders_joined as ( select * from customers_and_customer_orders_joined ``` -- ✅ **Materialized as tables or incremental models.** Once we reach the marts layer, it’s time to start building not just our logic into the warehouse, but the data itself. This gives end users much faster performance for these later models that are actually designed for their use, and saves us costs recomputing these entire chains of models every time somebody refreshes a dashboard or runs a regression in python. A good general rule of thumb regarding materialization is to always start with a view (as it takes up essentially no storage and always gives you up-to-date results), once that view takes too long to practically *query*, build it into a table, and finally once that table takes too long to *build* and is slowing down your runs, [configure it as an incremental model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models/). As always, start simple and only add complexity as necessary. The models with the most data and compute-intensive transformations should absolutely take advantage of dbt’s excellent incremental materialization options, but rushing to make all your marts models incremental by default will introduce superfluous difficulty. We recommend reading this [classic post from Tristan on the limits of incremental modeling](https://discourse.getdbt.com/t/on-the-limits-of-incrementality/303). +- ✅ **Materialized as tables or incremental models.** Once we reach the marts layer, it’s time to start building not just our logic into the warehouse, but the data itself. 
This gives end users much faster performance for these later models that are actually designed for their use, and saves us costs recomputing these entire chains of models every time somebody refreshes a dashboard or runs a regression in python. A good general rule of thumb regarding materialization is to always start with a view (as it takes up essentially no storage and always gives you up-to-date results), once that view takes too long to practically _query_, build it into a table, and finally once that table takes too long to _build_ and is slowing down your runs, [configure it as an incremental model](https://docs.getdbt.com/docs/build/incremental-models/). As always, start simple and only add complexity as necessary. The models with the most data and compute-intensive transformations should absolutely take advantage of dbt’s excellent incremental materialization options, but rushing to make all your marts models incremental by default will introduce superfluous difficulty. We recommend reading this [classic post from Tristan on the limits of incremental modeling](https://discourse.getdbt.com/t/on-the-limits-of-incrementality/303). - ✅ **Wide and denormalized.** Unlike old school warehousing, in the modern data stack storage is cheap and it’s compute that is expensive and must be prioritized as such, packing these into very wide denormalized concepts that can provide everything somebody needs about a concept as a goal. - ❌ **Too many joins in one mart.** One good rule of thumb when building dbt transformations is to avoid bringing together too many concepts in a single mart. What constitutes ‘too many’ can vary. If you need to bring 8 staging models together with nothing but simple joins, that might be fine. Conversely, if you have 4 concepts you’re weaving together with some complex and computationally heavy window functions, that could be too much. 
You need to weigh the number of models you’re joining against the complexity of the logic within the mart, and if it’s too much to read through and build a clear mental model of then look to modularize. While this isn’t a hard rule, if you’re bringing together more than 4 or 5 concepts to create your mart, you may benefit from adding some intermediate models for added clarity. Two intermediate models that bring together three concepts each, and a mart that brings together those two intermediate models, will typically result in a much more readable chain of logic than a single mart with six joins. - ✅ **Build on separate marts thoughtfully.** While we strive to preserve a narrowing DAG up to the marts layer, once here things may start to get a little less strict. A common example is passing information between marts at different grains, as we saw above, where we bring our `orders` mart into our `customers` marts to aggregate critical order data into a `customer` grain. Now that we’re really ‘spending’ compute and storage by actually building the data in our outputs, it’s sensible to leverage previously built resources to speed up and save costs on outputs that require similar data, versus recomputing the same views and CTEs from scratch. The right approach here is heavily dependent on your unique DAG, models, and goals — it’s just important to note that using a mart in building another, later mart is okay, but requires careful consideration to avoid wasted resources or circular dependencies. :::tip Marts are entity-grained. -The most important aspect of marts is that they contain all of the useful data about a *particular entity* at a granular level. That doesn’t mean we don’t bring in lots of other entities and concepts, like tons of `user` data into our `orders` mart, we do! It just means that individual `orders` remain the core grain of our table. 
If we start grouping `users` and `orders` along a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source), into something like `user_orders_per_day`, we’re moving past marts into *metrics*. +The most important aspect of marts is that they contain all of the useful data about a _particular entity_ at a granular level. That doesn’t mean we don’t bring in lots of other entities and concepts, like tons of `user` data into our `orders` mart, we do! It just means that individual `orders` remain the core grain of our table. If we start grouping `users` and `orders` along a [date spine](https://github.com/dbt-labs/dbt-utils#date_spine-source), into something like `user_orders_per_day`, we’re moving past marts into _metrics_. ::: ### Marts: Other considerations -- **Troubleshoot via tables.** While stacking views and ephemeral models up until our marts — only building data into the warehouse at the end of a chain when we have the models we really want end users to work with — is ideal in production, it can present some difficulties in development. Particularly, certain errors may seem to be surfacing in our later models that actually stem from much earlier dependencies in our model chain (ancestor models in our DAG that are built before the model throwing the errors). If you’re having trouble pinning down where or what a database error is telling you, it can be helpful to temporarily build a specific chain of models as tables so that the warehouse will throw the error where it’s actually occurring. -- **After marts: the activation layer.** In the same way that our staging models are building blocks for our marts, that also offer us direct views into specific source data, our marts are building blocks for our final outputs that also offer direct views into specific ideas. You can use marts directly, but they are equally important as components for building models in the *activation layer* after marts. 
This is a deep and fast-evolving topic, so we’ll cover this in a separate forthcoming guide that dives into: - - Metrics - - Reverse ETL - - Reporting and dashboards - - Data science and ML - - [Exposures](https://docs.getdbt.com/docs/build/exposures) (how we tie our dbt DAG into all of the above) +- **Troubleshoot via tables.** While stacking views and ephemeral models up until our marts — only building data into the warehouse at the end of a chain when we have the models we really want end users to work with — is ideal in production, it can present some difficulties in development. Particularly, certain errors may seem to be surfacing in our later models that actually stem from much earlier dependencies in our model chain (ancestor models in our DAG that are built before the model throws the errors). If you’re having trouble pinning down where or what a database error is telling you, it can be helpful to temporarily build a specific chain of models as tables so that the warehouse will throw the error where it’s actually occurring. diff --git a/website/docs/guides/best-practices/how-we-structure/5-semantic-layer-marts.md b/website/docs/guides/best-practices/how-we-structure/5-semantic-layer-marts.md new file mode 100644 index 00000000000..adebc4a63c7 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-structure/5-semantic-layer-marts.md @@ -0,0 +1,48 @@ +--- +title: "Marts for the Semantic Layer" +id: "5-semantic-layer-marts" +--- + +The Semantic Layer alters some fundamental principles of how you organize your project. Using dbt without the Semantic Layer necessitates creating the most useful combinations of your building block components into wide, denormalized marts. On the other hand, the Semantic Layer leverages MetricFlow to denormalize every possible combination of components we've encoded dynamically. As such we're better served to bring more normalized models through from the logical layer into the Semantic Layer to maximize flexibility. 
This section will assume familiarity with the best practices laid out in the [How we build our metrics](https://docs.getdbt.com/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) guide, so check that out first for a more hands-on introduction to the Semantic Layer. + +## Semantic Layer: Files and folders + +- 2️⃣ There are two major factors that alter our recommendations for the Semantic Layer: + - 📝 There is **more YAML** in the form of **semantic models and metrics**. + - ⏫ We may **use a staging model directly** if it forms a complete normalized component, and it will not have a mart at all. +- 💪 This combination means models at **both the staging and marts layer** may participate in the Semantic Layer and use **more powerful, expansive YAML configuration**. +- 🔁 Given this, for projects using the Semantic Layer we recommend a **YAML-file-per-model approach**, as below. + +```shell +models +├── marts +│   ├── customers.sql +│   ├── customers.yml +│   ├── orders.sql +│   └── orders.yml +└── staging + ├── __sources.yml + ├── stg_customers.sql + ├── stg_customers.yml + ├── stg_locations.sql + ├── stg_locations.yml + ├── stg_order_items.sql + ├── stg_order_items.yml + ├── stg_orders.sql + ├── stg_orders.yml + ├── stg_products.sql + ├── stg_products.yml + ├── stg_supplies.sql + └── stg_supplies.yml +``` + +## When to make a mart + +- ❓ If we can go directly to staging models and it's better to serve normalized models to the Semantic Layer, then when, where, and why would we make a mart? + - 🕰️ We have models that have measures but no time dimension to aggregate against. The details of this are laid out in the [Semantic Layer guide](https://docs.getdbt.com/guides/best-practices/how-we-build-our-metrics/semantic-layer-1-intro) but in short, we need a time dimension to aggregate against in MetricFlow. Dimensional tables that have static numeric values, like supply costs or tax rates, but no time dimensions are a common example. + - 🧱 We want to **materialize** our model in various ways. + - 👯 We want to **version** our model. 
+ - 🛒 We have various related models that make more sense as **one wider component**. + - 1️⃣ We have similar models across multiple data sources that make more sense **unioned together**. + - ⌚ We have models in our project we **need time to refactor** but want to serve up to the Semantic Layer quickly. +- 🌍 Any of the above and more are great reasons to build a mart. Analytics engineering is about **creativity and problem solving**, so these are not prescriptive rules, **there are many reasons to build marts** in any project. The most important takeaway is that you don't **_have to_** if you're using the Semantic Layer. diff --git a/website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md b/website/docs/guides/best-practices/how-we-structure/6-the-rest-of-the-project.md similarity index 84% rename from website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md rename to website/docs/guides/best-practices/how-we-structure/6-the-rest-of-the-project.md index ea54462dabb..4082f92b932 100644 --- a/website/docs/guides/best-practices/how-we-structure/5-the-rest-of-the-project.md +++ b/website/docs/guides/best-practices/how-we-structure/6-the-rest-of-the-project.md @@ -1,13 +1,16 @@ --- title: "The rest of the project" -id: 5-the-rest-of-the-project +id: "6-the-rest-of-the-project" +description: The rest of the project. +displayText: The rest of the project. +hoverSnippet: The rest of the project. --- ### Project structure review So far we’ve focused on the `models` folder, the primary directory of our dbt project. Next, we’ll zoom out and look at how the rest of our project files and folders fit in with this structure, starting with how we approach YAML configuration files. 
-```markdown +```shell models ├── intermediate │ └── finance @@ -44,11 +47,11 @@ models When structuring your YAML configuration files in a dbt project, you want to balance centralization and file size to make specific configs as easy to find as possible. It’s important to note that while the top-level YAML files (`dbt_project.yml`, `packages.yml`) need to be specifically named and in specific locations, the files containing your `sources` and `models` dictionaries can be named, located, and organized however you want. It’s the internal contents that matter here. As such, we’ll lay out our primary recommendation, as well as the pros and cons of a popular alternative. Like many other aspects of structuring your dbt project, what’s most important here is consistency, clear intention, and thorough documentation on how and why you do what you do. - ✅ **Config per folder.** As in the example above, create a `_[directory]__models.yml` per directory in your models folder that configures all the models in that directory. for staging folders, also include a `_[directory]__sources.yml` per directory. - - The leading underscore ensure your YAML files will be sorted to the top of every folder to make them easy to separate from your models. - - YAML files don’t need unique names in the way that SQL model files do, but including the directory (instead of simply `_sources.yml` in each folder), means you can fuzzy find for the right file more quickly. + - The leading underscore ensures your YAML files will be sorted to the top of every folder to make them easy to separate from your models. + - YAML files don’t need unique names in the way that SQL model files do, but including the directory (instead of simply `_sources.yml` in each folder), means you can fuzzy find the right file more quickly. - We’ve recommended several different naming conventions over the years, most recently calling these `schema.yml` files. 
We’ve simplified to recommend that these simply be labelled based on the YAML dictionary that they contain. - - If you utilize [doc blocks](https://docs.getdbt.com/docs/building-a-dbt-project/documentation#using-docs-blocks) in your project, we recommend following the same pattern, and creating a `_[directory]__docs.md` markdown file per directory containing all your doc blocks for that folder of models. -- ❌ **Config per project.** Some people put *all* of their source and model YAML into one file. While you can technically do this, and while it certainly simplifies knowing what file the config you’re looking for will be in (as there is only one file), it makes it much harder to find specific configurations within that file. We recommend balancing those two concerns. + - If you utilize [doc blocks](https://docs.getdbt.com/docs/collaborate/documentation#using-docs-blocks) in your project, we recommend following the same pattern, and creating a `_[directory]__docs.md` markdown file per directory containing all your doc blocks for that folder of models. +- ❌ **Config per project.** Some people put _all_ of their source and model YAML into one file. While you can technically do this, and while it certainly simplifies knowing what file the config you’re looking for will be in (as there is only one file), it makes it much harder to find specific configurations within that file. We recommend balancing those two concerns. - ⚠️ **Config per model.** On the other end of the spectrum, some people prefer to create one YAML file per model. This presents less of an issue than a single monolith file, as you can quickly search for files, know exactly where specific configurations exist, spot models without configs (and thus without tests) by looking at the file tree, and various other advantages. 
In our opinion, the extra files, tabs, and windows this requires creating, copying from, pasting to, closing, opening, and managing creates a somewhat slower development experience that outweighs the benefits. Defining config per directory is the most balanced approach for most projects, but if you have compelling reasons to use config per model, there are definitely some great projects that follow this paradigm. - ✅ **Cascade configs.** Leverage your `dbt_project.yml` to set default configurations at the directory level. Use the well-organized folder structure we’ve created thus far to define the baseline schemas and materializations, and use dbt’s cascading scope priority to define variations to this. For example, as below, define your marts to be materialized as tables by default, define separate schemas for our separate subfolders, and any models that need to use incremental materialization can be defined at the model level. @@ -70,32 +73,32 @@ models: ``` :::tip Define your defaults. -One of the many benefits this consistent approach to project structure confers to us is this ability to cascade default behavior. Carefully organizing our folders and defining configuration at that level whenever possible frees us from configuring things like schema and materialization in every single model (not very DRY!) — we only need to configure exceptions to our general rules. Tagging is another area this principle comes into play. Many people new to dbt will rely on tags rather than a rigorous folder structure, and quickly find themselves in a place where every model *requires* a tag. This creates unnecessary complexity. 
We want to lean on our folders as our primary selectors and grouping mechanism, and use tags to define groups that are *exceptions.* A folder-based selection like **`dbt build --select marts.marketing` is much simpler than trying to tag every marketing-related model, hoping all developers remember to add that tag for new models, and using `dbt build --select tag:marketing`. +One of the many benefits this consistent approach to project structure confers to us is this ability to cascade default behavior. Carefully organizing our folders and defining configuration at that level whenever possible frees us from configuring things like schema and materialization in every single model (not very DRY!) — we only need to configure exceptions to our general rules. Tagging is another area this principle comes into play. Many people new to dbt will rely on tags rather than a rigorous folder structure, and quickly find themselves in a place where every model _requires_ a tag. This creates unnecessary complexity. We want to lean on our folders as our primary selectors and grouping mechanism, and use tags to define groups that are _exceptions._ A folder-based selection like `dbt build --select marts.marketing` is much simpler than trying to tag every marketing-related model, hoping all developers remember to add that tag for new models, and using `dbt build --select tag:marketing`. ::: ### How we use the other folders -```yaml +```shell jaffle_shop -├── analysis -├── data +├── analyses +├── seeds │ └── employees.csv ├── macros │ ├── _macros.yml │ └── cents_to_dollars.sql ├── snapshots └── tests - └── assert_positive_value_for_total_amount.sql +└── assert_positive_value_for_total_amount.sql ``` We’ve focused heavily thus far on the primary area of action in our dbt project, the `models` folder. As you’ve probably observed though, there are several other folders in our project.
While these are, by design, very flexible to your needs, we’ll discuss the most common use cases for these other folders to help get you started. - ✅ `seeds` for lookup tables. The most common use case for seeds is loading lookup tables that are helpful for modeling but don’t exist in any source systems — think mapping zip codes to states, or UTM parameters to marketing campaigns. In this example project we have a small seed that maps our employees to their `customer_id`s, so that we can handle their purchases with special logic. - ❌ `seeds` for loading source data. Do not use seeds to load data from a source system into your warehouse. If it exists in a system you have access to, you should be loading it with a proper EL tool into the raw data area of your warehouse. dbt is designed to operate on data in the warehouse, not as a data-loading tool. -- ✅ `analysis` for storing auditing queries. The `analysis` folder lets you store any queries you want to use Jinja with and version control, but not build into models in your warehouse. There are limitless possibilities here, but the most common use case when we set up projects at dbt Labs is to keep queries that leverage the [audit helper](https://github.com/dbt-labs/dbt-audit-helper) package. This package is incredibly useful for finding discrepancies in output when migrating logic from another system into dbt. +- ✅ `analyses` for storing auditing queries. The `analyses` folder lets you store any queries you want to use Jinja with and version control, but not build into models in your warehouse. There are limitless possibilities here, but the most common use case when we set up projects at dbt Labs is to keep queries that leverage the [audit helper](https://github.com/dbt-labs/dbt-audit-helper) package. This package is incredibly useful for finding discrepancies in output when migrating logic from another system into dbt. - ✅ `tests` for testing multiple specific tables simultaneously. 
As dbt tests have evolved, writing singular tests has become less and less necessary. It's extremely useful for work-shopping test logic, but more often than not you'll find yourself either migrating that logic into your own custom generic tests or discovering a pre-built test that meets your needs from the ever-expanding universe of dbt packages (between the extra tests in [`dbt-utils`](https://github.com/dbt-labs/dbt-utils) and [`dbt-expectations`](https://github.com/calogica/dbt-expectations) almost any situation is covered). One area where singular tests still shine though is flexibly testing things that require a variety of specific models. If you're familiar with the difference between [unit tests](https://en.wikipedia.org/wiki/Unit_testing) [and](https://www.testim.io/blog/unit-test-vs-integration-test/) [integration](https://www.codecademy.com/resources/blog/what-is-integration-testing/) [tests](https://en.wikipedia.org/wiki/Integration_testing) in software engineering, you can think of generic and singular tests in a similar way. If you need to test the results of how several specific models interact or relate to each other, a singular test will likely be the quickest way to nail down your logic. -- ✅ `snapshots` for creating [Type 2 slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) records from [Type 1](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_1:_overwrite) (destructively updated) source data. This is [covered thoroughly in the dbt Docs](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots), unlike these other folders has a more defined purpose, and is out-of-scope for this guide, but mentioned for completion. -- ✅ `macros` for DRY-ing up transformations you find yourself doing repeatedly. 
Like snapshots, a full dive into macros is out-of-scope for this guide and well [covered elsewhere](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros/), but one important structure-related recommendation is to [write documentation for your macros](https://docs.getdbt.com/faqs/docs/documenting-macros). We recommend creating a `_macros.yml` and documenting the purpose and arguments for your macros once they’re ready for use. +- ✅ `snapshots` for creating [Type 2 slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) records from [Type 1](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_1:_overwrite) (destructively updated) source data. This is [covered thoroughly in the dbt Docs](/docs/build/snapshots), unlike these other folders has a more defined purpose, and is out-of-scope for this guide, but mentioned for completion. +- ✅ `macros` for DRY-ing up transformations you find yourself doing repeatedly. Like snapshots, a full dive into macros is out-of-scope for this guide and well [covered elsewhere](/docs/build/jinja-macros), but one important structure-related recommendation is to [write documentation for your macros](https://docs.getdbt.com/faqs/docs/documenting-macros). We recommend creating a `_macros.yml` and documenting the purpose and arguments for your macros once they’re ready for use. ### Project splitting @@ -103,11 +106,11 @@ One important, growing consideration in the analytics engineering ecosystem is h - ❌ **Business groups or departments.** Conceptual separations within the project are not a good reason to split up your project. Splitting up, for instance, marketing and finance modeling into separate projects will not only add unnecessary complexity, but destroy the unifying effect of collaborating across your organization on cohesive definitions and business logic. 
- ❌ **ML vs Reporting use cases.** Similarly to the point above, splitting a project up based on different use cases, particularly more standard BI versus ML features, is a common idea. We tend to discourage it for the time being. As with the previous point, a foundational goal of implementing dbt is to create a single source of truth in your organization. The features you’re providing to your data science teams should be coming from the same marts and metrics that serve reports on executive dashboards. There are a growing number of tools like [fal](https://blog.fal.ai/introducing-fal-dbt/) and [Continual.ai](http://Continual.ai) that make excellent use of this unified viewpoint. -- ✅ **Data governance.** Structural, organizational needs — such as data governance and security — are one of the few worthwhile reasons to split up a project. If, for instance, you work at a healthcare company with only a small team cleared to access raw data with PII in it, you may need to split out your staging models into their own project to preserve those policies. In that case, you would import your staging project into the project that builds on those staging models as a [private package](https://docs.getdbt.com/docs/building-a-dbt-project/package-management/#private-packages). -- ✅ **Project size.** At a certain point, your project may grow to have simply too many models to present a viable development experiment. If you have 1000s of models, it absolutely makes sense to find a way to split up your project. +- ✅ **Data governance.** Structural, organizational needs — such as data governance and security — are one of the few worthwhile reasons to split up a project. If, for instance, you work at a healthcare company with only a small team cleared to access raw data with PII in it, you may need to split out your staging models into their own project to preserve those policies. 
In that case, you would import your staging project into the project that builds on those staging models as a [private package](https://docs.getdbt.com/docs/build/packages/#private-packages). +- ✅ **Project size.** At a certain point, your project may grow to have simply too many models to present a viable development experience. If you have 1000s of models, it absolutely makes sense to find a way to split up your project. ## Final considerations -Overall, consistency is more important than any of these specific conventions. As your project grows and your experience with dbt deepens, you will undoubtedly find aspects of the above structure you want to change. While we recommend this approach for the majority of projects, every organization is unique! The only dogmatic advice we’ll put forward here is that when you find aspects of the above structure you wish to change, think intently about your reasoning and document for your team *how* and *why* you are deviating from these conventions. To that end, we highly encourage you to fork this guide and add it to your project’s README, wiki, or docs so you can quickly create and customize those artifacts. +Overall, consistency is more important than any of these specific conventions. As your project grows and your experience with dbt deepens, you will undoubtedly find aspects of the above structure you want to change. While we recommend this approach for the majority of projects, every organization is unique! The only dogmatic advice we’ll put forward here is that when you find aspects of the above structure you wish to change, think intently about your reasoning and document for your team _how_ and _why_ you are deviating from these conventions. To that end, we highly encourage you to fork this guide and add it to your project’s README, wiki, or docs so you can quickly create and customize those artifacts. Finally, we emphasize that this guide is a living document! 
It will certainly change and grow as dbt and dbt Labs evolve. We invite you to join in — discuss, comment, and contribute regarding suggested changes or new elements to cover. diff --git a/website/docs/guides/best-practices/how-we-style/0-how-we-style-our-dbt-projects.md b/website/docs/guides/best-practices/how-we-style/0-how-we-style-our-dbt-projects.md new file mode 100644 index 00000000000..dd695af2602 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-style/0-how-we-style-our-dbt-projects.md @@ -0,0 +1,29 @@ +--- +title: How we style our dbt projects +id: 0-how-we-style-our-dbt-projects +--- + +## Why does style matter? + +Style might seem like a trivial, surface-level issue, but it's a deeply material aspect of a well-built project. A consistent, clear style enhances readability and makes your project easier to understand and maintain. Highly readable code helps build clear mental models making it easier to debug and extend your project. It's not just a favor to yourself, though; equally importantly, it makes it less effort for others to understand and contribute to your project, which is essential for peer collaboration, open-source work, and onboarding new team members. [A style guide lets you focus on what matters](https://mtlynch.io/human-code-reviews-1/#settle-style-arguments-with-a-style-guide), the logic and impact of your project, rather than the superficialities of how it's written. This brings harmony and pace to your team's work, and makes reviews more enjoyable and valuable. + +## What's important about style? + +There are two crucial tenets of code style: + +- Clarity +- Consistency + +Style your code in such a way that you can quickly read and understand it. It's also important to consider code review and git diffs. If you're making a change to a model, you want reviewers to see just the material changes you're making clearly. + +Once you've established a clear style, stay consistent. This is the most important thing. 
Everybody on your team needs to have a unified style, which is why having a style guide is so crucial. If you're writing a model, you should be able to look at other models in the project that your teammates have written and read in the same style. If you're writing a macro or a test, you should see the same style as your models. Consistency is key. + +## How should I style? + +You should style the project in a way you and your teammates or collaborators agree on. The most important thing is that you have a style guide and stick to it. This guide is just a suggestion to get you started and to give you a sense of what a style guide might look like. It covers various areas you may want to consider, with suggested rules. It emphasizes lots of whitespace, clarity, clear naming, and comments. + +We believe one of the strengths of SQL is that it reads like English, so we lean into that declarative nature throughout our projects. Even within dbt Labs, though, there are differing opinions on how to style, even a small but passionate contingent of leading comma enthusiasts! Again, the important thing is not to follow this style guide; it's to make _your_ style guide and follow it. Lastly, be sure to include rules, tools, _and_ examples in your style guide to make it as easy as possible for your team to follow. + +## Automation + +Use formatters and linters as much as possible. We're all human, we make mistakes. Not only that, but we all have different preferences and opinions while writing code. Automation is a great way to ensure that your project is styled consistently and correctly and that people can write in a way that's quick and comfortable for them, while still getting perfectly consistent output. 
diff --git a/website/docs/guides/best-practices/how-we-style/1-how-we-style-our-dbt-models.md b/website/docs/guides/best-practices/how-we-style/1-how-we-style-our-dbt-models.md new file mode 100644 index 00000000000..0157af63cfb --- /dev/null +++ b/website/docs/guides/best-practices/how-we-style/1-how-we-style-our-dbt-models.md @@ -0,0 +1,66 @@ +--- +title: How we style our dbt models +id: 1-how-we-style-our-dbt-models +--- + +## Fields and model names + +- 👥 Models should be pluralized, for example, `customers`, `orders`, `products`. +- 🔑 Each model should have a primary key. +- 🔑 The primary key of a model should be named `<object>_id`, for example, `account_id`. This makes it easier to know what `id` is being referenced in downstream joined models. +- 🔑 Keys should be string data types. +- 🔑 Consistency is key! Use the same field names across models where possible. For example, a key to the `customers` table should be named `customer_id` rather than `user_id` or `id`. +- ❌ Do not use abbreviations or aliases. Emphasize readability over brevity. For example, do not use `cust` for `customer` or `o` for `orders`. +- ❌ Avoid reserved words as column names. +- ➕ Booleans should be prefixed with `is_` or `has_`. +- 🕰️ Timestamp columns should be named `<event>_at` (for example, `created_at`) and should be in UTC. If a different timezone is used, this should be indicated with a suffix (`created_at_pt`). +- 📆 Dates should be named `<event>_date`. For example, `created_date`. +- 🔙 Event dates and times should be past tense — `created`, `updated`, or `deleted`. +- 💱 Price/revenue fields should be in decimal currency (`19.99` for $19.99; many app databases store prices as integers in cents). If a non-decimal currency is used, indicate this with a suffix (`price_in_cents`). +- 🐍 Schema, table and column names should be in `snake_case`. +- 🏦 Use names based on the _business_ terminology, rather than the source terminology.
For example, if the source database uses `user_id` but the business calls them `customer_id`, use `customer_id` in the model. +- 🔢 Versions of models should use the suffix `_v1`, `_v2`, etc for consistency (`customers_v1` and `customers_v2`). +- 🗄️ Use a consistent ordering of data types and consider grouping and labeling columns by type, as in the example below. This will minimize join errors and make it easier to read the model, as well as help downstream consumers of the data understand the data types and scan models for the columns they need. We prefer to use the following order: ids, strings, numerics, booleans, dates, and timestamps. + +## Example model + +```sql +with + +source as ( + + select * from {{ source('ecom', 'raw_orders') }} + +), + +renamed as ( + + select + + ---------- ids + id as order_id, + store_id as location_id, + customer as customer_id, + + ---------- strings + status as order_status, + + ---------- numerics + (order_total / 100.0)::float as order_total, + (tax_paid / 100.0)::float as tax_paid, + + ---------- booleans + is_fulfilled, + + ---------- dates + date(order_date) as ordered_date, + + ---------- timestamps + ordered_at + + from source + +) + +select * from renamed +``` diff --git a/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md b/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md new file mode 100644 index 00000000000..8c61e63b888 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-style/2-how-we-style-our-sql.md @@ -0,0 +1,186 @@ +--- +title: How we style our SQL +id: 2-how-we-style-our-sql +--- + +## Basics + +- ☁️ Use [SQLFluff](https://sqlfluff.com/) to maintain these style rules automatically. + - Customize `.sqlfluff` configuration files to your needs. + - Refer to our [SQLFluff config file](https://github.com/dbt-labs/jaffle-shop-template/blob/main/.sqlfluff) for the rules we use in our own projects. 
+ + - Exclude files and directories by using a standard `.sqlfluffignore` file. Learn more about the syntax in the [.sqlfluffignore syntax docs](https://docs.sqlfluff.com/en/stable/configuration.html#id2). +- 👻 Use Jinja comments (`{# #}`) for comments that should not be included in the compiled SQL. +- ⏭️ Use trailing commas. +- 4️⃣ Indents should be four spaces. +- 📏 Lines of SQL should be no longer than 80 characters. +- ⬇️ Field names, keywords, and function names should all be lowercase. +- 🫧 The `as` keyword should be used explicitly when aliasing a field or table. + +:::info +☁️ dbt Cloud users can use the built-in [SQLFluff Cloud IDE integration](https://docs.getdbt.com/docs/cloud/dbt-cloud-ide/lint-format) to automatically lint and format their SQL. The default style sheet is based on dbt Labs style as outlined in this guide, but you can customize this to fit your needs. No need to set up any external tools, just hit `Lint`! The more opinionated [sqlfmt](http://sqlfmt.com/) formatter is also available if you prefer that style. +::: + +## Fields, aggregations, and grouping + +- 🔙 Fields should be stated before aggregates and window functions. +- 🤏🏻 Aggregations should be executed as early as possible (on the smallest data set possible) before joining to another table to improve performance. +- 🔢 Ordering and grouping by a number (e.g. `group by 1, 2`) is preferred over listing the column names (see [this classic rant](https://www.getdbt.com/blog/write-better-sql-a-defense-of-group-by-1) for why). Note that if you are grouping by more than a few columns, it may be worth revisiting your model design. + +## Joins + +- 👭🏻 Prefer `union all` to `union` unless you explicitly want to remove duplicates. +- 👭🏻 If joining two or more tables, _always_ prefix your column names with the table name. If only selecting from one table, prefixes are not needed. +- 👭🏻 Be explicit about your join type (i.e. write `inner join` instead of `join`).
+- 🥸 Avoid table aliases in join conditions (especially initialisms) — it's harder to understand what the table called "c" is as compared to "customers". +- ➡️ Always move left to right to make joins easy to reason about - `right joins` often indicate that you should change which table you select `from` and which one you `join` to. + +## 'Import' CTEs + +- 🔝 All `{{ ref('...') }}` statements should be placed in CTEs at the top of the file. +- 📦 'Import' CTEs should be named after the table they are referencing. +- 🤏🏻 Limit the data scanned by CTEs as much as possible. Where possible, only select the columns you're actually using and use `where` clauses to filter out unneeded data. +- For example: + +```sql +with + +orders as ( + + select + order_id, + customer_id, + order_total, + order_date + + from {{ ref('orders') }} + + where order_date >= '2020-01-01' + +) +``` + +## 'Functional' CTEs + +- ☝🏻 Where performance permits, CTEs should perform a single, logical unit of work. +- 📖 CTE names should be as verbose as needed to convey what they do e.g. `events_joined_to_users` instead of `user_events` (this could be a good model name, but does not describe a specific function or transformation). +- 🌉 CTEs that are duplicated across models should be pulled out into their own intermediate models. Look out for chunks of repeated logic that should be refactored into their own model. +- 🔚 The last line of a model should be a `select *` from your final output CTE. This makes it easy to materialize and audit the output from different steps in the model as you're developing it. You just change the CTE referenced in the `select` statement to see the output from that step. + +## Model configuration + +- 📝 Model-specific attributes (like sort/dist keys) should be specified in the model. +- 📂 If a particular configuration applies to all models in a directory, it should be specified in the `dbt_project.yml` file. 
+- 👓 In-model configurations should be specified like this for maximum readability: + +```sql +{{ + config( + materialized = 'table', + sort = 'id', + dist = 'id' + ) +}} +``` + +## Example SQL + +```sql +with + +events as ( + + ... + +), + +{# CTE comments go here #} +filtered_events as ( + + ... + +) + +select * from filtered_events +``` + +### Example SQL + +```sql +with + +my_data as ( + + select + field_1, + field_2, + field_3, + cancellation_date, + expiration_date, + start_date + + from {{ ref('my_data') }} + +), + +some_cte as ( + + select + id, + field_4, + field_5 + + from {{ ref('some_cte') }} + +), + +some_cte_agg as ( + + select + id, + sum(field_4) as total_field_4, + max(field_5) as max_field_5 + + from some_cte + + group by 1 + +), + +joined as ( + + select + my_data.field_1, + my_data.field_2, + my_data.field_3, + + -- use line breaks to visually separate calculations into blocks + case + when my_data.cancellation_date is null + and my_data.expiration_date is not null + then expiration_date + when my_data.cancellation_date is null + then my_data.start_date + 7 + else my_data.cancellation_date + end as cancellation_date, + + some_cte_agg.total_field_4, + some_cte_agg.max_field_5 + + from my_data + + left join some_cte_agg + on my_data.id = some_cte_agg.id + + where my_data.field_1 = 'abc' and + ( + my_data.field_2 = 'def' or + my_data.field_2 = 'ghi' + ) + + having count(*) > 1 + +) + +select * from joined +``` diff --git a/website/docs/guides/best-practices/how-we-style/3-how-we-style-our-python.md b/website/docs/guides/best-practices/how-we-style/3-how-we-style-our-python.md new file mode 100644 index 00000000000..5443abf302d --- /dev/null +++ b/website/docs/guides/best-practices/how-we-style/3-how-we-style-our-python.md @@ -0,0 +1,44 @@ +--- +title: How we style our Python +id: 3-how-we-style-our-python +--- + +## Python tooling + +- 🐍 Python has a more mature and robust ecosystem for formatting and linting (helped by the fact that it doesn't 
have a million distinct dialects). We recommend using those tools to format and lint your code in the style you prefer. + +- 🛠️ Our current recommendations are + + - [black](https://pypi.org/project/black/) formatter + - [ruff](https://pypi.org/project/ruff/) linter + + :::info + ☁️ dbt Cloud comes with the [black formatter built-in](https://docs.getdbt.com/docs/cloud/dbt-cloud-ide/lint-format) to automatically format your Python code. You don't need to download or configure anything, just click `Format` in a Python model and you're good to go! + ::: + +## Example Python + +```python +import pandas as pd + + +def model(dbt, session): + # set length of time considered a churn + pd.Timedelta(days=2) + + dbt.config(enabled=False, materialized="table", packages=["pandas==1.5.2"]) + + orders_relation = dbt.ref("stg_orders") + + # converting a DuckDB Python Relation into a pandas DataFrame + orders_df = orders_relation.df() + + orders_df.sort_values(by="ordered_at", inplace=True) + orders_df["previous_order_at"] = orders_df.groupby("customer_id")[ + "ordered_at" + ].shift(1) + orders_df["next_order_at"] = orders_df.groupby("customer_id")["ordered_at"].shift( + -1 + ) + return orders_df +``` diff --git a/website/docs/guides/best-practices/how-we-style/4-how-we-style-our-jinja.md b/website/docs/guides/best-practices/how-we-style/4-how-we-style-our-jinja.md new file mode 100644 index 00000000000..3a969d2bdd3 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-style/4-how-we-style-our-jinja.md @@ -0,0 +1,37 @@ +--- +title: How we style our Jinja +id: 4-how-we-style-our-jinja +--- + +## Jinja style guide + +- 🫧 When using Jinja delimiters, use spaces on the inside of your delimiter, like `{{ this }}` instead of `{{this}}` +- 🆕 Use newlines to visually indicate logical blocks of Jinja. +- 4️⃣ Indent 4 spaces into a Jinja block to indicate visually that the code inside is wrapped by that block.
+- ❌ Don't worry (too much) about Jinja whitespace control, focus on your project code being readable. The time you save by not worrying about whitespace control will far outweigh the time you spend in your compiled code where it might not be perfect. + +## Examples of Jinja style + +```jinja +{% macro make_cool(uncool_id) %} + + do_cool_thing({{ uncool_id }}) + +{% endmacro %} +``` + +```sql +select + entity_id, + entity_type, + {% if this %} + + {{ that }}, + + {% else %} + + {{ the_other_thing }}, + + {% endif %} + {{ make_cool('uncool_id') }} as cool_id +``` diff --git a/website/docs/guides/best-practices/how-we-style/5-how-we-style-our-yaml.md b/website/docs/guides/best-practices/how-we-style/5-how-we-style-our-yaml.md new file mode 100644 index 00000000000..323ed3ac11d --- /dev/null +++ b/website/docs/guides/best-practices/how-we-style/5-how-we-style-our-yaml.md @@ -0,0 +1,44 @@ +--- +title: How we style our YAML +id: 5-how-we-style-our-yaml +--- + +## YAML Style Guide + +- 2️⃣ Indents should be two spaces +- ➡️ List items should be indented +- 🆕 Use a new line to separate list items that are dictionaries where appropriate +- 📏 Lines of YAML should be no longer than 80 characters. +- 🛠️ Use the [dbt JSON schema](https://github.com/dbt-labs/dbt-jsonschema) with any compatible IDE and a YAML formatter (we recommend [Prettier](https://prettier.io/)) to validate your YAML files and format them automatically. + +:::info +☁️ As with Python and SQL, the dbt Cloud IDE comes with built-in formatting for YAML files (Markdown and JSON too!), via Prettier. Just click the `Format` button and you're in perfect style. As with the other tools, you can [also customize the formatting rules](https://docs.getdbt.com/docs/cloud/dbt-cloud-ide/lint-format#format-yaml-markdown-json) to your liking to fit your company's style guide.
+::: + +### Example YAML + +```yaml +version: 2 + +models: + - name: events + columns: + - name: event_id + description: This is a unique identifier for the event + tests: + - unique + - not_null + + - name: event_time + description: "When the event occurred in UTC (eg. 2018-01-01 12:00:00)" + tests: + - not_null + + - name: user_id + description: The ID of the user who recorded the event + tests: + - not_null + - relationships: + to: ref('users') + field: id +``` diff --git a/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md b/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md new file mode 100644 index 00000000000..a6402e46870 --- /dev/null +++ b/website/docs/guides/best-practices/how-we-style/6-how-we-style-conclusion.md @@ -0,0 +1,107 @@ +--- +title: Now it's your turn +id: 6-how-we-style-conclusion +--- + +## BYO Styles + +Now that you've seen how we style our dbt projects, it's time to build your own. Feel free to copy this guide and use it as a template for your own project. If you do, we'd love to hear about it! Reach out to us on [the Community Forum](https://discourse.getdbt.com/c/show-and-tell/22) or [Slack](https://www.getdbt.com/community) to share your style guide. We recommend co-locating your style guide with your code to make sure contributors can easily follow it. If you're using GitHub, you can add your style guide to your repository's wiki, or include it in your README. + +## Pre-commit hooks + +Lastly, to ensure your style guide's automated rules are being followed without additional mental overhead to your team, you can use [pre-commit hooks](https://pre-commit.com/) to automatically check your code for style violations (and often fix them automagically) before it's committed. This is a great way to make sure your style guide is followed by all contributors. We recommend implementing this once you've settled on and published your style guide, and your codebase is conforming to it. 
This will ensure that all future commits follow the style guide. You can find an excellent set of open source pre-commit hooks for dbt from the community [here in the dbt-checkpoint project](https://github.com/dbt-checkpoint/dbt-checkpoint). + +## Style guide template + +```markdown +# dbt Example Style Guide + +## SQL Style + +- Use lowercase keywords. +- Use trailing commas. + +## Model Organization + +Our models (typically) fit into two main categories:\ + +- Staging — Contains models that clean and standardize data. +- Marts — Contains models which combine or heavily transform data. + +Things to note: + +- There are different types of models that typically exist in each of the above categories. See [Model Layers](#model-layers) for more information. +- Read [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) for an example and more details around organization. + +## Model Layers + +- Only models in `staging` should select from [sources](https://docs.getdbt.com/docs/building-a-dbt-project/using-sources). +- Models not in the `staging` folder should select from [refs](https://docs.getdbt.com/reference/dbt-jinja-functions/ref). + +## Model File Naming and Coding + +- All objects should be plural. + Example: `stg_stripe__invoices.sql` vs. `stg_stripe__invoice.sql` + +- All models should use the naming convention `<type/dag_stage>_<source/topic>__<additional_context>`. See [this article](https://docs.getdbt.com/blog/stakeholder-friendly-model-names) for more information. + + - Models in the **staging** folder should use the source's name as the `<source/topic>` and the entity name as the `additional_context`. + + Examples: + + - seed_snowflake_spend.csv + - base_stripe\_\_invoices.sql + - stg_stripe\_\_customers.sql + - stg_salesforce\_\_customers.sql + - int_customers\_\_unioned.sql + - fct_orders.sql + +- Schema, table, and column names should be in `snake_case`. + +- Limit the use of abbreviations that are related to domain knowledge.
An onboarding employee will understand `current_order_status` better than `current_os`. + +- Use names based on the _business_ rather than the source terminology. + +- Each model should have a primary key to identify the unique row and should be named `<object>_id`. For example, `account_id`. This makes it easier to know what `id` is referenced in downstream joined models. + +- For `base` or `staging` models, columns should be ordered in categories, where identifiers are first and date/time fields are at the end. +- Date/time columns should be named according to these conventions: + + - Timestamps: `<event>_at` + Format: UTC + Example: `created_at` + + - Dates: `<event>_date` + Format: Date + Example: `created_date` + +- Booleans should be prefixed with `is_` or `has_`. + Example: `is_active_customer` and `has_admin_access` + +- Price/revenue fields should be in decimal currency (for example, `19.99` for $19.99; many app databases store prices as integers in cents). If a non-decimal currency is used, indicate this with suffixes. For example, `price_in_cents`. + +- Avoid using reserved words (such as [these](https://docs.snowflake.com/en/sql-reference/reserved-keywords.html) for Snowflake) as column names. + +- Consistency is key! Use the same field names across models where possible. For example, a key to the `customers` table should be named `customer_id` rather than `user_id`. + +## Model Configurations + +- Model configurations at the [folder level](https://docs.getdbt.com/reference/model-configs#configuring-directories-of-models-in-dbt_projectyml) should be considered (and if applicable, applied) first. +- More specific configurations should be applied at the model level [using one of these methods](https://docs.getdbt.com/reference/model-configs#apply-configurations-to-one-model-only). +- Models within the `marts` folder should be materialized as `table` or `incremental`. + - By default, `marts` should be materialized as `table` within `dbt_project.yml`.
+ - If switching to `incremental`, this should be specified in the model's configuration. + +## Testing + +- At a minimum, `unique` and `not_null` tests should be applied to the expected primary key of each model. + +## CTEs + +For more information about why we use so many CTEs, read [this glossary entry](https://docs.getdbt.com/terms/cte). + +- Where performance permits, CTEs should perform a single, logical unit of work. +- CTE names should be as verbose as needed to convey what they do. +- CTEs with confusing or notable logic should be commented with SQL comments as you would with any complex functions and should be located above the CTE. +- CTEs duplicated across models should be pulled out and created as their own models. +``` diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-1-guide-overview.md b/website/docs/guides/best-practices/materializations/materializations-guide-1-guide-overview.md new file mode 100644 index 00000000000..209041b1df5 --- /dev/null +++ b/website/docs/guides/best-practices/materializations/materializations-guide-1-guide-overview.md @@ -0,0 +1,39 @@ +--- +title: "Materializations best practices" +id: materializations-guide-1-guide-overview +slug: 1-guide-overview +description: Read this guide to understand how using materializations in dbt is a crucial skill for effective analytics engineering. +displayText: Materializations best practices +hoverSnippet: Read this guide to understand how using materializations in dbt is a crucial skill for effective analytics engineering. +--- + +What _really_ happens when you type `dbt build`? Contrary to popular belief, a crack team of microscopic data elves do _not_ construct your data row by row, although the truth feels equally magical. This guide explores the real answer to that question, with an introductory look at the objects that get built into your warehouse, why they matter, and how dbt knows what to build.
+ +The configurations that tell dbt how to construct these objects are called _materializations,_ and knowing how to use them is a crucial skill for effective analytics engineering. When you’ve completed this guide, you will have the ability to use the three core materializations that cover most common analytics engineering situations. + +:::info +😌 **Materializations abstract away DDL and DML**. Typically in raw SQL- or python-based [data transformation](https://www.getdbt.com/analytics-engineering/transformation/), you have to write specific imperative instructions on how to build or modify your data objects. dbt’s materializations make this declarative: we tell dbt how we want things to be constructed and it figures out how to do that given the unique conditions and qualities of our warehouse. +::: + +### Learning goals + +By the end of this guide you should have a solid understanding of: + +- 🛠️ what **materializations** are +- 👨‍👨‍👧 how the three main materializations that ship with dbt — **table**, **view**, and **incremental** — differ +- 🗺️ **when** and **where** to use specific materializations to optimize your development and production builds +- ⚙️ how to **configure materializations** at various scopes, from an individual model to an entire folder + +### Prerequisites + +- 📒 You’ll want to have worked through the [quickstart guide](/quickstarts) and have a project set up to work through these concepts. +- 🏃🏻‍♀️ Concepts like dbt runs, `ref()` statements, and models should be familiar to you. +- 🔧 [**Optional**] Reading through the [How we structure our dbt projects](guides/best-practices/how-we-structure/1-guide-overview) Guide will be beneficial for the last section of this guide, when we review best practices for materializations using the dbt project approach of staging models and marts. + +### Guiding principle + +We’ll explore this in-depth throughout, but the basic guideline is **start as simple as possible**.
We’ll follow a tiered approach, only moving up a tier when it’s necessary. + +- 🔍 **Start with a view.** When the view gets too long to _query_ for end users, +- ⚒️ **Make it a table.** When the table gets too long to _build_ in your dbt Jobs, +- 📚 **Build it incrementally.** That is, layer the data on in chunks as it comes in. diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-2-available-materializations.md b/website/docs/guides/best-practices/materializations/materializations-guide-2-available-materializations.md new file mode 100644 index 00000000000..54110b46385 --- /dev/null +++ b/website/docs/guides/best-practices/materializations/materializations-guide-2-available-materializations.md @@ -0,0 +1,59 @@ +--- +title: "Available materializations" +id: materializations-guide-2-available-materializations +slug: 2-available-materializations +description: Read this guide to understand the different types of materializations you can create in dbt. +displayText: Materializations best practices +hoverSnippet: Read this guide to understand the different types of materializations you can create in dbt. +--- + +Views and tables and incremental models, oh my! In this section we’ll start getting our hands dirty digging into the three basic materializations that ship with dbt. They are considerably less scary and more helpful than lions, tigers, or bears — although perhaps not as cute (can data be cute? We at dbt Labs think so). We’re going to define, implement, and explore: + +- 🔍 **views** +- ⚒️ **tables** +- 📚 **incremental models** + +:::info +👻 There is a fourth default materialization available in dbt called **ephemeral materialization**. It is less broadly applicable than the other three, and better deployed for specific use cases that require weighing some tradeoffs. We chose to leave it out of this guide and focus on the three materializations that will power 99% of your modeling needs.
+::: + +**Views and Tables are the two basic categories** of object that we can create across warehouses. They exist natively as types of objects in the warehouse, as you can see from this screenshot of Snowflake (depending on your warehouse the interface will look a little different). **Incremental models** and other materializations types are a little bit different. They tell dbt to **construct tables in a special way**. + +![Tables and views in the browser on Snowflake.](/img/guides/best-practices/materializations/tables-and-views.png) + +### Views + +- ✅ **The default materialization in dbt**. A starting project has no configurations defined for materializations, which means _everything_ is by default built as a view. +- 👩‍💻 **Store _only the SQL logic_ of the transformation in the warehouse, _not the data_**. As such, they make a great default. They build almost instantly and cost almost nothing to build. +- ⏱️ Always reflect the **most up-to-date** version of the input data, as they’re run freshly every time they’re queried. +- 👎 **Have to be processed every time they’re queried, so slower to return results than a table of the same data.** That also means they can cost more over time, especially if they contain intensive transformations and are queried often. + +### Tables + +- 🏗️ **Tables store the data itself** as opposed to views which store the query logic. This means we can pack all of the transformation compute into a single run. A view is storing a _query_ in the warehouse. Even to preview that data we have to query it. A table is storing the literal rows and columns on disk. +- 🏎️ Querying lets us **access that transformed data directly**, so we get better performance. Tables feel **faster and more responsive** compared to views of the same logic. +- 💸 **Improves compute costs.** Compute is significantly more expensive than storage. 
So while tables use much more storage, it’s generally an economical tradeoff, as you only pay for the transformation compute when you build a table during a job, rather than every time you query it. +- 🔍 **Ideal for models that get queried regularly**, due to the combination of these qualities. +- 👎 **Limited to the source data that was available when we did our most recent run.** We’re ‘freezing’ the transformation logic into a table. So if we run a model as a table every hour, at 10:59a we still only have data up to 10a, because that was what was available in our source data when we ran the table last at 10a. Only at the next run will the newer data be included in our rebuild. + +### Incremental models + +- 🧱 **Incremental** models build a **table** in **pieces over time**, only adding and updating new or changed records. +- 🏎️  **Builds more quickly** than a regular table of the same logic. +- 🐢 **Initial runs are slow.** Typically we use incremental models on very large datasets, so building the initial table on the full dataset is time consuming and equivalent to the table materialization. +- 👎 **Add complexity.** Incremental models require deeper consideration of layering and timing. +- 👎 Can drift from source data over time. As we’re not processing all of the source data when we run an incremental model, extra effort is required to capture changes to historical data. 
+ +### Comparing the materialization types + +| | view | table | incremental | +| -------------------- | ------------------------------------ | -------------------------------------- | -------------------------------------- | +| 🛠️⌛ **build time** | 💚  fastest — only stores logic | ❤️  slowest — linear to size of data | 💛  medium — builds flexible portion | +| 🛠️💸 **build costs** | 💚  lowest — no data processed | ❤️  highest — all data processed | 💛  medium — some data processed | +| 📊💸 **query costs** | ❤️  higher — reprocess every query | 💚  lower — data in warehouse | 💚  lower — data in warehouse | +| 🍅🌱 **freshness** | 💚  best — up-to-the-minute of query | 💛  moderate — up to most recent build | 💛  moderate — up to most recent build | +| 🧠🤔 **complexity** | 💚 simple - maps to warehouse object | 💚 simple - map to warehouse concept | 💛 moderate - adds logical complexity | + +:::info +🔑 **Time is money.** Notice in the above chart that the time and costs rows contain the same results. This is to highlight that when we’re talking about time in warehouses, we’re talking about compute time, which is the primary driver of costs. +::: diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md b/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md new file mode 100644 index 00000000000..54f4443b600 --- /dev/null +++ b/website/docs/guides/best-practices/materializations/materializations-guide-3-configuring-materializations.md @@ -0,0 +1,90 @@ +--- +title: "Configuring materializations" +id: materializations-guide-3-configuring-materializations +slug: 3-configuring-materializations +description: Read this guide to understand how to configure materializations in dbt. +displayText: Materializations best practices +hoverSnippet: Read this guide to understand how to configure materializations in dbt. 
+--- + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +## Configuring materializations + +Choosing which materialization to use is as simple as setting any other configuration in dbt. We’ll look first at how we select our materializations for individual models, then at more powerful ways of setting materializations for entire folders of models. + +### Configuring tables and views + +Let’s look at how we can use tables and views to get started with materializations: + +- ⚙️ We can configure an individual model’s materialization using a **Jinja `config` block**, and passing in the **`materialized` argument**. This tells dbt what materialization to use. +- 🚰 The underlying specifics of what is run depend on [which **adapter** you’re using](/docs/supported-data-platforms), but the end results will be equivalent. +- 😌 This is one of the many valuable aspects of dbt: it lets us use a **declarative** approach, specifying the _outcome_ that we want in our code, rather than _specific steps_ to achieve it (the latter is an _imperative_ approach if you want to get computer science-y about it 🤓). +- 🔍 In the below case, we want to create a **view**, and can **declare** that in a **single line of code**. + + + + +```sql + {{ + config( + materialized='view' + ) + }} + + select ... +``` + + + + +```python +def model(dbt, session): + + dbt.config(materialized="view") + + # model logic + + return model_df +``` + + + + +:::info +🐍 **Not all adapters support python yet**, check the [docs here to be sure](/docs/build/python-models#specific-data-platforms) before spending time writing python models. +::: + +- Configuring a model to materialize as a `table` is simple, and the same as a `view` for both SQL and python models. + + + + +```sql +{{ + config( + materialized='table' + ) +}} + +select ...
+``` + + + + +```python +def model(dbt, session): + + dbt.config(materialized="table") + + # model logic + + return model_df +``` + + + + +Go ahead and try some of these out! diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md b/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md new file mode 100644 index 00000000000..603cbc8cda1 --- /dev/null +++ b/website/docs/guides/best-practices/materializations/materializations-guide-4-incremental-models.md @@ -0,0 +1,159 @@ +--- +title: "Incremental models in-depth" +id: materializations-guide-4-incremental-models +slug: 4-incremental-models +description: Read this guide to understand the incremental models you can create in dbt. +displayText: Materializations best practices +hoverSnippet: Read this guide to understand the incremental models you can create in dbt. +--- + +So far we’ve looked at tables and views, which map to the traditional objects in the data warehouse. As mentioned earlier, incremental models are a little different. This is where we start to deviate from this pattern with more powerful and complex materializations. + +- 📚 **Incremental models generate tables.** They physically persist the data itself to the warehouse, just piece by piece. What’s different is **how we build that table**. +- 💅 **Only apply our transformations to rows of data with new or updated information**, this maximizes efficiency. + - 🌍  If we have a very large set of data or compute-intensive transformations, or both, it can be very slow and costly to process the entire corpus of source data being input into a model or chain of models. If instead we can identify _only rows that contain new information_ (that is, **new or updated records**), we then can process just those rows, building our models _incrementally_.
+- 3️⃣  We need **3 key things** in order to accomplish the above: + - a **filter** to select just the new or updated records + - a **conditional block** that wraps our filter and only applies it when we want it + - **configuration** that tells dbt we want to build incrementally and helps apply the conditional filter when needed + +Let’s dig into how exactly we can do that in dbt. Let’s say we have an `orders` table that looks like the below: + +| order_id | order_status | customer_id | order_item_id | ordered_at | updated_at | +| -------- | ------------ | ----------- | ------------- | ---------- | ---------- | +| 123 | shipped | 7 | 5791 | 2022-01-30 | 2022-01-30 | +| 234 | confirmed | 15 | 1643 | 2022-01-31 | 2022-01-31 | + +We did our last `dbt build` job on `2022-01-31`, so any new orders since that run won’t appear in our table. When we do our next run (for simplicity let’s say the next day, although for an orders model we’d more realistically run this hourly), we have two options: + +- 🏔️ build the table from the **beginning of time again — a _table materialization_** + - Simple and solid, if we can afford to do it (in terms of time, compute, and money — which are all directly correlated in a cloud warehouse). It’s the easiest and most accurate option. +- 🤏 find a way to run **just new and updated rows since our previous run — _an_ _incremental materialization_** + - If we _can’t_ realistically afford to run the whole table — due to complex transformations or big source data, it takes too long — then we want to build incrementally. We want to just transform and add the row with id 567 below, _not_ the previous two with ids 123 and 234 that are already in the table.
+ +| order_id | order_status | customer_id | order_item_id | ordered_at | updated_at | +| -------- | ------------ | ----------- | ------------- | ---------- | ---------- | +| 123 | shipped | 7 | 5791 | 2022-01-30 | 2022-01-30 | +| 234 | confirmed | 15 | 1643 | 2022-01-31 | 2022-01-31 | +| 567 | shipped | 61 | 28 | 2022-02-01 | 2022-02-01 | + +### Writing incremental logic + +Let’s think through the information we’d need to build such a model that only processes new and updated data. We would need: + +- 🕜  **a timestamp indicating when a record was last updated**, let’s call it our `updated_at` timestamp, as that’s a typical convention and what we have in our example above. +- ⌛ the **most recent timestamp from this table _in our warehouse_** _—_ that is, the one created by the previous run — to act as a cutoff point. We’ll call the model we’re working in `this`, for ‘this model we’re working in’. + +That would let us construct logic like this: + +```sql +select * from orders + +where + updated_at > (select max(updated_at) from {{ this }}) +``` + +Let’s break down that `where` clause a bit, because this is where the action is with incremental models. Stepping through the code **_right-to-left_** we: + +1. Get our **cutoff.** + 1. Select the `max(updated_at)` timestamp — the **most recent record** + 2. from `{{ this }}` — the table for this model as it exists in the warehouse, as **built in our last run**, + 3. so `max(updated_at) from {{ this }}` the **_most recent record processed in our last run,_** + 4. that’s exactly what we want as a **cutoff**! +2. **Filter** the rows we’re selecting to add in this run. + 1. Use the `updated_at` timestamp from our input, the equivalent column to the one in the warehouse, but in the up-to-the-minute **source data we’re selecting from** and + 2. check if it’s **greater than our cutoff,** + 3.
if so it will satisfy our where clause, so we’re **selecting all the rows more recent than our cutoff.** + +This logic would let us isolate and apply our transformations to just the records that have come in since our last run, and I’ve got some great news: that magic `{{ this }}` keyword [does in fact exist in dbt](/reference/dbt-jinja-functions/this), so we can write exactly this logic in our models. + +### Configuring incremental models + +So we’ve found a way to isolate the new rows we need to process. How then do we handle the rest? We still need to: + +- ➕  make sure dbt knows to **_add_ new rows on top** of the existing table in the warehouse, **not replace** it. +- 👉  If there are **updated rows**, we need a way for dbt to know **which rows to update**. +- 🌍  Lastly, if we’re building into a new environment and there’s **no previous run to reference**, or we need to **build the model from scratch.** Put another way, we’ll want a means to skip the incremental logic and transform all of our input data like a regular table if needed. +- 😎 **Visualized below**, we’ve figured out how to get the red ‘new records’ portion selected, but we need to sort out the step to the right, where we stick those on to our model. + +![Diagram visualizing how incremental models work](/img/guides/best-practices/materializations/incremental-diagram.png) + +:::info +😌 Incremental models can be confusing at first, **take your time reviewing** this visual and the previous steps until you have a **clear mental model.** Be patient with yourself. This materialization will become second nature soon, but it’s tough at first. If you’re feeling confused the [dbt Community is here for you on the Forum and Slack](community/join). +::: + +Thankfully dbt has some additional configuration and special syntax just for incremental models. 
+ +First, let’s look at a config block for incremental materialization: + +```sql +{{ + config( + materialized='incremental', + unique_key='order_id' + ) +}} + +select ... +``` + +- 📚 The **`materialized` config** works just like tables and views, we just pass it the value `'incremental'`. +- 🔑 We’ve **added a new config option `unique_key`,** that tells dbt that if it finds a record in our previous run — the data in the warehouse already — with the same unique id (in our case `order_id` for our `orders` table) that exists in the new data we’re adding incrementally, to **update that record instead of adding it as a separate row**. +- 👯 This **hugely broadens the types of data we can build incrementally** from just immutable tables (data where rows only ever get added, never updated) to mutable records (where rows might change over time). As long as we’ve got a column that specifies when records were updated (such as `updated_at` in our example), we can handle almost anything. +- ➕ We’re now **adding records** to the table **and updating existing rows**. That’s 2 of 3 concerns. +- 🆕 We still need to **build the table from scratch** (via `dbt build` or `run` in a job) when necessary — whether because we’re in a new environment so don’t have an initial table to build on, or our model has drifted from the original over time due to data loading latency. +- 🔀 We need to wrap our incremental logic, that is our `where` clause with our `updated_at` cutoff, in a **conditional statement that will only apply it when certain conditions are met**. If you’re thinking this is **a case for a Jinja `{% if %}` statement**, you’re absolutely right! + +### Incremental conditions + +So we’re going to use an **if statement** to apply our cutoff filter **only when certain conditions are met**. 
We want to apply our cutoff filter _if_ the **following things are true**: + +- ➕  we’ve set the materialization **config** to incremental, +- 🛠️  there is an **existing table** for this model in the warehouse to build on, +- 🙅‍♀️  and the `--full-refresh` **flag was _not_ passed.** + - [full refresh](reference/resource-configs/full_refresh) is a configuration and flag that is specifically designed to let us override the incremental materialization and build a table from scratch again. + +Thankfully, we don’t have to dig into the guts of dbt to sort out each of these conditions individually. + +- ⚙️  dbt provides us with a **macro [`is_incremental`](/docs/build/incremental-models#understanding-the-is_incremental-macro)** that checks all of these conditions for this exact use case. +- 🔀  By **wrapping our cutoff logic** in this macro, it will only get applied when the macro returns true for all of the above conditions. + +Let’s take a look at all these pieces together: + +```sql +{{ + config( + materialized='incremental', + unique_key='order_id' + ) +}} + +select * from orders + +{% if is_incremental() %} + +where + updated_at > (select max(updated_at) from {{ this }}) + +{% endif %} +``` + +Fantastic! We’ve got a working incremental model. On our first run, when there is no corresponding table in the warehouse, `is_incremental` will evaluate to false and we’ll capture the entire table. On subsequent runs it will evaluate to true and we’ll apply our filter logic, capturing only the newer data. + +### Late arriving facts + +Our last concern specific to incremental models is what to do when data is inevitably loaded in a less-than-perfect way. Sometimes data loaders will, for a variety of reasons, load data late. Either an entire load comes in late, or some rows come in on a later load than the one they should have arrived with. The following is best practice for every incremental model to slow down the drift this can cause.
+ +- 🕐 For example if most of our records for `2022-01-30` come in the raw schema of our warehouse on the morning of `2022-01-31`, but a handful don’t get loaded til `2022-02-02`, how might we tackle that? There will already be `max(updated_at)` timestamps of `2022-01-31` in the warehouse, filtering out those late records. **They’ll never make it to our model.** +- 🪟 To mitigate this, we can add a **lookback window** to our **cutoff** point. By **subtracting a few days** from the `max(updated_at)`, we would capture any late data within the window of what we subtracted. +- 👯 As long as we have a **`unique_key` defined in our config**, we’ll simply update existing rows and avoid duplication. We process more data this way, but in a fixed way, and it keeps our model hewing closer to the source data. + +### Long-term considerations + +Late arriving facts point to the biggest tradeoff with incremental models: + +- 🪢 In addition to extra **complexity**, they also inevitably **drift from the source data over time.** Due to the imperfection of loaders and the reality of late arriving facts, we can’t help but miss some day in-between our incremental runs, and this accumulates. +- 🪟 We can slow this entropy with the lookback window described above — **the longer the window the less efficient the model, but the slower the drift.** It’s important to note it will still occur though, however slowly. If we have a lookback window of 3 days, and a record comes in 4 days late from the loader, we’re still going to miss it. +- 🌍 Thankfully, there is a way we can reset the relationship of the model to the source data. We can run the model with the **`--full-refresh` flag passed** (such as `dbt build --full-refresh -s orders`). As we saw in the `is_incremental` conditions above, that will make our logic return false, and our `where` clause filter will not be applied, running the whole table. 
+- 🏗️ This will let us **rebuild the entire table from scratch,** a good practice to do regularly **if the size of the data will allow**. +- 📆 A common pattern for incremental models of manageable size is to run a **full refresh on the weekend** (or any low point in activity), either **weekly or monthly**, to consistently reset the drift from late arriving facts. diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-5-best-practices.md b/website/docs/guides/best-practices/materializations/materializations-guide-5-best-practices.md new file mode 100644 index 00000000000..a2cb22d5755 --- /dev/null +++ b/website/docs/guides/best-practices/materializations/materializations-guide-5-best-practices.md @@ -0,0 +1,74 @@ +--- +title: Best practices for materializations +id: materializations-guide-5-best-practices +slug: 5-best-practices +description: Read this guide to understand the different types of materializations you can create in dbt. +displayText: Materializations best practices +hoverSnippet: Read this guide to understand the different types of materializations you can create in dbt. +--- + +First, let’s consider some properties of various levels of our dbt project and materializations. + +- 🔍 **Views** return the freshest, real-time state of their input data when they’re queried, this makes them ideal as **building blocks** for larger models. + - 🧶  When we’re building a model that stitches lots of other models together, we don’t want to worry about all those models having different states of freshness because they were built into tables at different times. We want all those inputs to give us all the underlying source data available. +- 🤏 **Views** are also great for **small datasets** with minimally intensive logic that we want **near realtime** access to. +- 🛠️ **Tables** are the **most performant** materialization, as they just return the transformed data when they’re queried, with no need to reprocess it. 
+ - 📊  This makes tables great for **things end users touch**, like a mart that services a popular dashboard. + - 💪 Tables are also ideal for **frequently used, compute intensive** transformations. Making a table allows us to ‘freeze’ those transformations in place. +- 📚  **Incremental models** are useful for the **same purposes as tables**, they just enable us to build them on larger datasets, so they can be **built** _and_ **accessed** in a **performant** way. + +### Project-level configuration + +Keeping these principles in mind, we can apply these materializations to a project. Earlier we looked at how to configure an individual model’s materializations. In practice though, we’ll want to set materializations at the folder level, and use individual model configs to override those as needed. This will keep our code DRY and avoid repeating the same config blocks in every model. + +- 📂  In the `dbt_project.yml` we have a `models:` section (by default at the bottom of the file) we can use to define various **configurations for entire directories**. +- ⚙️  These are the **same configs that are passed to a `{{ config() }}` block** for individual models, but they get set for _every model in that directory and any subdirectories nested within it_. +- ➕  We demarcate between a folder name and a configuration by using a `+`, so `marketing`, `paid_ads`, and `google` below are folder names, whereas **`+materialized` is a configuration** being applied to those folders and all folders nested below them. +- ⛲  Configurations set in this way **cascade**, the **more specific scope** is the one that will be set. +- 👇🏻  In the example below, all the models in the `marketing` and `paid_ads` folders would be views, but the `google` sub folder would be **tables.** + +```yaml +models: + jaffle_shop: + marketing: + +materialized: view + paid_ads: + google: + +materialized: table +``` + +### Staging views + +We’ll start off simple with staging models.
Let’s consider some aspects of staging models to determine the ideal materialization strategy: + +- 🙅‍♀️ Staging models are **rarely accessed** directly by our **end users.** +- 🧱 They need to be always up-to-date and in sync with our source data as **building blocks** for later models. +- 🔍  It’s clear we’ll want to keep our **staging models as views**. +- 👍  Since views are the **default materialization** in dbt, we don’t _have_ to do any specific configuration for this. +- 💎  Still, for clarity, it’s a **good idea** to go ahead and **specify the configuration** to be explicit. We’ll want to make sure our `dbt_project.yml` looks like this: + +```yaml +models: + jaffle_shop: + staging: + +materialized: view +``` + +### Table and incremental marts + +As we’ve learned, views store only the logic of the transformation in the warehouse, so our runs take only a couple seconds per model (or less). What happens when we go to query the data though? + +![Long query time from Snowflake](/img/guides/best-practices/materializations/snowflake-query-timing.png) + +Our marts are slow to query! + +Let’s contrast the same aspects of marts that we considered for staging models to assess the best materialization strategy: + +- 📊  Marts are **frequently accessed directly by our end users**, and need to be **performant.** +- ⌛  They can often **function with intermittently refreshed data**, end user decision making in many domains is **fine with hourly or daily data.** +- 🛠️  Given the above properties we’ve got a great use case for **building the data itself** into the warehouse, not the logic. In other words, **a table**. +- ❓ The only decision we need to make with our marts is whether we can **process the whole table at once or do we need to do it in chunks**, that is, are we going to use the `table` materialization or `incremental`. 
+ +:::info +🔑 **Golden Rule of Materializations** Start with models as views, when they take too long to query, make them tables, when the tables take too long to build, make them incremental. +::: diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-6-examining-builds.md b/website/docs/guides/best-practices/materializations/materializations-guide-6-examining-builds.md new file mode 100644 index 00000000000..07811b42594 --- /dev/null +++ b/website/docs/guides/best-practices/materializations/materializations-guide-6-examining-builds.md @@ -0,0 +1,50 @@ +--- +title: "Examining our builds" +id: materializations-guide-6-examining-builds +slug: 6-examining-builds +description: Read this guide to understand how to examine your builds in dbt. +displayText: Materializations best practices +hoverSnippet: Read this guide to understand how to examine your builds in dbt. +--- + +## Examining our builds + +- ⌚ dbt keeps track of how **long each model took to build**, when it started, when it finished, its completion status (error, warn, or success), its materialization type, and _much_ more. +- 🖼️ This information is stored in a couple files which dbt calls **artifacts**. +- 📊 Artifacts contain a ton of information in JSON format, so aren’t easy to read, but **dbt Cloud** packages the most useful bits of information into a tidy **visualization** for you. +- ☁️ If you’re not using Cloud, we can still use the output of the **dbt CLI to understand our runs**. + +### Model Timing + +That’s where dbt Cloud’s Model Timing visualization comes in extremely handy. If we’ve set up a [Job](/quickstarts/bigquery) in dbt Cloud to run our models, we can use the Model Timing tab to pinpoint our longest-running models. 
+ +![dbt Cloud's Model Timing diagram](/img/guides/best-practices/materializations/model-timing-diagram.png) + +- 🧵 This view lets us see our models **mapped out in threads** (up to 64 threads, we’re currently running with 4, so we get 4 tracks) over time. You can think of **each thread as a lane on a highway**. +- ⌛ We can see above that `customer_status_histories` is **taking by far the most time**, so we may want to go ahead and **make that incremental**. + +If you aren’t using dbt Cloud, that’s okay! We don’t get a fancy visualization out of the box, but we can use the output from the dbt CLI to check our model times, and it’s a great opportunity to become familiar with that output. + +### dbt CLI output + +If you’ve ever run dbt, whether `build`, `test`, `run` or something else, you’ve seen some output like below. Let’s take a closer look at how to read this. + +![CLI output from a dbt build command](/img/guides/best-practices/materializations/dbt-build-output.png) + +- There are two entries per model, the **start** of a model’s build and the **completion**, which will include **how long** the model took to run. The **type** of model is included as well. For example: + +```shell +20:24:51 5 of 10 START sql view model main.stg_products ......... [RUN] +20:24:51 5 of 10 OK created sql view model main.stg_products .... [OK in 0.13s] +``` + +- 5️⃣  On **both rows** we can see that our `stg_products` model is the 5th of 10 objects being built, the timestamp it started at, that it was defined in SQL (as opposed to python), and that it was a view. +- 🆕  On the **first row** we can see the timestamp of when the model **started**. +- ✅  On the **second row** — which does _not_ necessarily come right after, thanks to threads other models can be starting and finishing as this model runs — we see the **completion** entry which adds the **status**, in this case `OK` , and the **time to build**, a lightning-fast 0.13s. That’s not unexpected considering what we know about views. 
+- 🏎️  **Views should typically take less than a second or two,** it’s tables and incremental models you’ll want to keep a closer eye on with these tools. + +### dbt Artifacts package + +- 🎨  Lastly, when it comes to examining your dbt runs, you’re **not stuck without fancy visuals** if you’re using dbt Core. It’s not set up out-of-the-box, but if you want to introspect your project more deeply, you can use the [dbt Artifacts package](https://github.com/brooklyn-data/dbt_artifacts). +- 👩‍🎨  This provides models you can **visualize for every aspect of your project** at a very granular level. +- ⌚  You can use it to **create your own model timing visualization** in your BI tool, and any other reports you need to keep an eye on your materialization strategy. diff --git a/website/docs/guides/best-practices/materializations/materializations-guide-7-conclusion.md b/website/docs/guides/best-practices/materializations/materializations-guide-7-conclusion.md new file mode 100644 index 00000000000..119563b9a50 --- /dev/null +++ b/website/docs/guides/best-practices/materializations/materializations-guide-7-conclusion.md @@ -0,0 +1,14 @@ +--- +title: "Conclusion" +id: materializations-guide-7-conclusion +slug: 7-conclusion +description: Read this conclusion to our guide on using materializations in dbt and how it is a crucial skill for effective analytics engineering. +displayText: Materializations best practices +hoverSnippet: Read this conclusion to our guide on using materializations in dbt and how it is a crucial skill for effective analytics engineering. +--- + +You're now following best practices in your project, and have optimized the materializations of your DAG. You’re equipped with the 3 main materializations that cover almost any analytics engineering situation! 
+ +There are more configs and materializations available, as well as specific materializations for certain platforms and adapters — and like everything with dbt, materializations are extensible, meaning you can create your own [custom materializations](/guides/advanced/creating-new-materializations) for your needs. So this is just the beginning of what you can do with these powerful configurations. + +For the vast majority of users and companies though, tables, views, and incremental models will handle everything you can throw at them. Develop your intuition and expertise for these materializations, and you’ll be well on your way to tackling advanced analytics engineering problems. diff --git a/website/docs/guides/advanced/adapter-development/1-what-are-adapters.md b/website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md similarity index 99% rename from website/docs/guides/advanced/adapter-development/1-what-are-adapters.md rename to website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md index 08769a9bc54..0959dbee707 100644 --- a/website/docs/guides/advanced/adapter-development/1-what-are-adapters.md +++ b/website/docs/guides/dbt-ecosystem/adapter-development/1-what-are-adapters.md @@ -1,5 +1,5 @@ --- -title: "What are adapters? Why do we need them?" +title: "What are adapters? Why do we need them?" 
id: "1-what-are-adapters" --- diff --git a/website/docs/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md similarity index 99% rename from website/docs/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter.md rename to website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md index 271108a620c..28cd8935937 100644 --- a/website/docs/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter.md +++ b/website/docs/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter.md @@ -41,7 +41,7 @@ When your adapter becomes more popular, and people start using it, you may quick New minor version releases of `dbt-core` may include changes to the Python interface for adapter plugins, as well as new or updated test cases. The maintainers of `dbt-core` will clearly communicate these changes in documentation and release notes, and they will aim for backwards compatibility whenever possible. -Patch releases of `dbt-core` will _not_ include breaking changes to adapter-facing code. For more details, see ["About dbt Core versions"](core-versions). +Patch releases of `dbt-core` will _not_ include breaking changes to adapter-facing code. For more details, see ["About dbt Core versions"](/docs/dbt-versions/core). 
### Versioning and releasing your adapter diff --git a/website/docs/guides/advanced/adapter-development/3-building-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md similarity index 97% rename from website/docs/guides/advanced/adapter-development/3-building-a-new-adapter.md rename to website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md index 9b24d0baaba..43826ca4b1d 100644 --- a/website/docs/guides/advanced/adapter-development/3-building-a-new-adapter.md +++ b/website/docs/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter.md @@ -9,7 +9,7 @@ Before you build your adapter, we strongly encourage you to first learn dbt as a This guide will walk you through the first creating the necessary adapter classes and macros, and provide some resources to help you validate that your new adapter is working correctly. Once the adapter is passing most of the functional tests (see ["Testing a new adapter"](4-testing-a-new-adapter) -), please let the community know that is available to use by adding the adapter to the ["Supported Data Platforms"](supported-data-platforms) page by following the steps given in [Documenting your adapter](5-documenting-a-new-adapter). +), please let the community know that is available to use by adding the adapter to the ["Supported Data Platforms"](/docs/supported-data-platforms) page by following the steps given in [Documenting your adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter). For any questions you may have, don't hesitate to ask in the [#adapter-ecosystem](https://getdbt.slack.com/archives/C030A0UF5LM) Slack channel. The community is very helpful and likely has experienced a similar issue as you. 
@@ -102,7 +102,7 @@ class MyAdapterCredentials(Credentials): There are a few things you can do to make it easier for users when connecting to your database: - Be sure to implement the Credentials' `_connection_keys` method shown above. This method will return the keys that should be displayed in the output of the `dbt debug` command. As a general rule, it's good to return all the arguments used in connecting to the actual database except the password (even optional arguments). -- Create a `profile_template.yml` to enable configuration prompts for a brand-new user setting up a connection profile via the [`dbt init` command](init). See more details [below](#other-files). +- Create a `profile_template.yml` to enable configuration prompts for a brand-new user setting up a connection profile via the [`dbt init` command](/reference/commands/init). See more details [below](#other-files). - You may also want to define an `ALIASES` mapping on your Credentials class to include any config names you want users to be able to use in place of 'database' or 'schema'. 
For example if everyone using the MyAdapter database calls their databases "collections", you might do: @@ -312,7 +312,7 @@ The following macros must be implemented, but you can override their behavior fo - `drop_schema` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/schema.sql#L12-L20)) - `get_columns_in_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/columns.sql#L1-L8)) (required) - `list_relations_without_caching` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L58-L65)) (required) -- `list_schemas` ([source](hhttps://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L29-L40)) +- `list_schemas` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/metadata.sql#L29-L40)) - `rename_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L56-L65)) - `truncate_relation` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/relation.sql#L45-L53)) - `current_timestamp` ([source](https://github.com/dbt-labs/dbt-core/blob/f988f76fccc1878aaf8d8631c05be3e9104b3b9a/core/dbt/include/global_project/macros/adapters/freshness.sql#L1-L8)) (required) @@ -361,7 +361,7 @@ Most modern databases support a majority of the standard SQL spec. 
There are som The `adapter.dispatch()` macro takes a second argument, `packages`, which represents a set of "search namespaces" in which to find potential implementations of a dispatched macro. This allows users of community-supported adapters to extend or "shim" dispatched macros from common packages, such as `dbt-utils`, with adapter-specific versions in their own project or other installed packages. See: - "Shim" package examples: [`spark-utils`](https://github.com/dbt-labs/spark-utils), [`tsql-utils`](https://github.com/dbt-msft/tsql-utils) -- [`adapter.dispatch` docs](dispatch) +- [`adapter.dispatch` docs](/reference/dbt-jinja-functions/dispatch) #### Overriding adapter methods @@ -409,7 +409,7 @@ This has moved to its own page: ["Testing a new adapter"](4-testing-a-new-adapte ## Documenting your new adapter -This has moved to its own page: ["Documenting a new adapter"](5-documenting-a-new-adapter) +This has moved to its own page: ["Documenting a new adapter"](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter) ## Maintaining your new adapter diff --git a/website/docs/guides/advanced/adapter-development/4-testing-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md similarity index 97% rename from website/docs/guides/advanced/adapter-development/4-testing-a-new-adapter.md rename to website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md index 2fa0b3aaba3..b1b5072670a 100644 --- a/website/docs/guides/advanced/adapter-development/4-testing-a-new-adapter.md +++ b/website/docs/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter.md @@ -27,7 +27,7 @@ The **[`tests` module](https://github.com/dbt-labs/dbt-core/tree/HEAD/core/dbt/t Those utilities allow you to do three basic things: 1. **Quickly set up a dbt "project."** Define project resources via methods such as `models()` and `seeds()`. Use `project_config_update()` to pass configurations into `dbt_project.yml`. 
-2. **Define a sequence of dbt commands.** The most important utility is `run_dbt()`, which returns the [results](dbt-classes#result-objects) of each dbt command. It takes a list of CLI specifiers (subcommand + flags), as well as an optional second argument, `expect_pass=False`, for cases where you expect the command to fail. +2. **Define a sequence of dbt commands.** The most important utility is `run_dbt()`, which returns the [results](/reference/dbt-classes#result-objects) of each dbt command. It takes a list of CLI specifiers (subcommand + flags), as well as an optional second argument, `expect_pass=False`, for cases where you expect the command to fail. 3. **Validate the results of those dbt commands.** For example, `check_relations_equal()` asserts that two database objects have the same structure and content. You can also write your own `assert` statements, by inspecting the results of a dbt command, or querying arbitrary database objects with `project.run_sql()`. You can see the full suite of utilities, with arguments and annotations, in [`util.py`](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/tests/util.py). You'll also see them crop up across a number of test cases. While all utilities are intended to be reusable, you won't need all of them for every test. In the example below, we'll show a simple test case that uses only a few utilities. @@ -185,7 +185,7 @@ tests/functional/test_example.py .X [100%] You can find more ways to run tests, along with a full command reference, in the [pytest usage docs](https://docs.pytest.org/how-to/usage.html). -We've found the `-s` flag (or `--capture=no`) helpful to print logs from the underlying dbt invocations, and to step into an interactive debugger if you've added one. You can also use environment variables to set [global dbt configs](global-configs), such as `DBT_DEBUG` (to show debug-level logs). 
+We've found the `-s` flag (or `--capture=no`) helpful to print logs from the underlying dbt invocations, and to step into an interactive debugger if you've added one. You can also use environment variables to set [global dbt configs](/reference/global-configs/about-global-configs), such as `DBT_DEBUG` (to show debug-level logs). ## Testing your adapter diff --git a/website/docs/guides/advanced/adapter-development/5-documenting-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md similarity index 87% rename from website/docs/guides/advanced/adapter-development/5-documenting-a-new-adapter.md rename to website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md index 9565ada14c7..80b994aefb0 100644 --- a/website/docs/guides/advanced/adapter-development/5-documenting-a-new-adapter.md +++ b/website/docs/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter.md @@ -8,9 +8,10 @@ If you've already [built](3-building-a-new-adapter), and [tested](4-testing-a-ne ## Making your adapter available Many community members maintain their adapter plugins under open source licenses. If you're interested in doing this, we recommend: + - Hosting on a public git provider (for example, GitHub or Gitlab) - Publishing to [PyPI](https://pypi.org/) -- Adding to the list of ["Supported Data Platforms"](supported-data-platforms#community-supported) (more info below) +- Adding to the list of ["Supported Data Platforms"](/docs/supported-data-platforms#community-supported) (more info below) ## General Guidelines @@ -35,29 +36,25 @@ We ask our adapter maintainers to use the [docs.getdbt.com repo](https://github. To simplify things, assume the reader of this documentation already knows how both dbt and your data platform works. There's already great material for how to learn dbt and the data platform out there. 
The documentation we're asking you to add should be what a user who is already profiecient in both dbt and your data platform would need to know in order to use both. Effectively that boils down to two things: how to connect, and how to configure. - ## Topics and Pages to Cover - The following subjects need to be addressed across three pages of this docs site to have your data platform be listed on our documentation. After the corresponding pull request is merged, we ask that you link to these pages from your adapter repo's `REAMDE` as well as from your product documentation. To contribute, all you will have to do make the changes listed in the table below. - - - | How To... | File to change within `/website/docs/` | Action | Info to Include | |----------------------|--------------------------------------------------------------|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Connect | `reference/warehouse-profiles/{MY-DATA-PLATOFRM}-profile.md` | Create | Give all information needed to define a target in `~/.dbt/profiles.yml` and get `dbt debug` to connect to the database successfully. All possible configurations should be mentioned. | -| Configure | `reference/resource-configs/{MY-DATA-PLATOFRM}-configs.md` | Create | What options and configuration specific to your data platform do users need to know? e.g. table distribution and indexing options, column_quoting policy, which incremental strategies are supported | +| Connect | `/docs/core/connect-data-platform/{MY-DATA-PLATFORM}-setup.md` | Create | Give all information needed to define a target in `~/.dbt/profiles.yml` and get `dbt debug` to connect to the database successfully. All possible configurations should be mentioned. 
| +| Configure | `reference/resource-configs/{MY-DATA-PLATFORM}-configs.md` | Create | What options and configuration specific to your data platform do users need to know? e.g. table distribution and indexing options, column_quoting policy, which incremental strategies are supported | | Discover and Install | `docs/supported-data-platforms.md` | Modify | Is it a vendor- or community- supported adapter? How to install Python adapter package? Ideally with pip and PyPI hosted package, but can also use `git+` link to GitHub Repo | | Add link to sidebar | `website/sidebars.js` | Modify | Add the document id to the correct location in the sidebar menu | -For example say I want to document my new adapter: `dbt-ders`. For the "Connect" page, I will make a new Markdown file, `ders-profile.md` and add it to the `website/docs/reference/warehouse-profiles/` directory. +For example say I want to document my new adapter: `dbt-ders`. For the "Connect" page, I will make a new Markdown file, `ders-setup.md` and add it to the `/website/docs/core/connect-data-platform/` directory. 
+ ## Example PRs to add new adapter documentation Below are some recent pull requests made by partners to document their data platform's adapter: - [TiDB](https://github.com/dbt-labs/docs.getdbt.com/pull/1309) - [SingleStore](https://github.com/dbt-labs/docs.getdbt.com/pull/1044) -- [Firebolt](https://github.com/dbt-labs/docs.getdbt.com/pull/941) \ No newline at end of file +- [Firebolt](https://github.com/dbt-labs/docs.getdbt.com/pull/941) diff --git a/website/docs/guides/advanced/adapter-development/6-promoting-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md similarity index 97% rename from website/docs/guides/advanced/adapter-development/6-promoting-a-new-adapter.md rename to website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md index 206179203fd..9bf2f949bef 100644 --- a/website/docs/guides/advanced/adapter-development/6-promoting-a-new-adapter.md +++ b/website/docs/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter.md @@ -56,7 +56,8 @@ The final group is where non-slack community engagement becomes important. Twitt Tell a story that engages dbt users and the community. Highlight new use cases and functionality unlocked by the adapter in a way that will resonate with each segment. ### Existing users of your technology who are new to dbt - - Provide a general overview of the value dbt will deliver to your users. This can lean on dbt's messaging and talking points which are laid out in the [dbt viewpoint.](https://docs.getdbt.com/docs/about/viewpoint) + +- Provide a general overview of the value dbt will deliver to your users. This can lean on dbt's messaging and talking points which are laid out in the [dbt viewpoint.](/community/resources/viewpoint) - Give examples of a rollout that speaks to the overall value of dbt and your product. 
### Users who are already familiar with dbt and the community diff --git a/website/docs/guides/advanced/adapter-development/7-verifying-a-new-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md similarity index 97% rename from website/docs/guides/advanced/adapter-development/7-verifying-a-new-adapter.md rename to website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md index 7fa36585877..6310569dfad 100644 --- a/website/docs/guides/advanced/adapter-development/7-verifying-a-new-adapter.md +++ b/website/docs/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter.md @@ -34,8 +34,8 @@ The adapter verification program aims to quickly indicate to users which adapter 4. assuring that it works for us internally and ideally an existing team using the adapter in production . -Every major & minor version of a adapter will be verified internally and given an official :white_check_mark: (custom emoji coming soon), on the ["Supported Data Platforms"](supported-data-platforms) page. +Every major & minor version of a adapter will be verified internally and given an official :white_check_mark: (custom emoji coming soon), on the ["Supported Data Platforms"](/docs/supported-data-platforms) page. ## How to get an adapter verified? -We envision that data platform vendors will be most interested in having their adapter versions verified, however we are open to community adapter verification. If interested, please reach out either to the `partnerships` at `dbtlabs.com` or post in the [#adapter-ecosystem Slack channel](https://getdbt.slack.com/archives/C030A0UF5LM). \ No newline at end of file +We envision that data platform vendors will be most interested in having their adapter versions verified, however we are open to community adapter verification. 
If interested, please reach out either to the `partnerships` at `dbtlabs.com` or post in the [#adapter-ecosystem Slack channel](https://getdbt.slack.com/archives/C030A0UF5LM). diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter.md b/website/docs/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter.md new file mode 100644 index 00000000000..9783ec66460 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/adapter-development/8-building-a-trusted-adapter.md @@ -0,0 +1,79 @@ +--- +title: "Building a Trusted Adapter" +id: "8-building-a-trusted-adapter" +--- + +The Trusted adapter program exists to allow adapter maintainers to demonstrate to the dbt community that your adapter is trusted to be used in production. + +## What does it mean to be trusted + +By opting into the below, you agree to this, and we take you at your word. dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the below guidelines not be met. + +### Feature Completeness + +To be considered for the Trusted Adapter program, the adapter must cover the essential functionality of dbt Core given below, with best effort given to support the entire feature set. + +Essential functionality includes (but is not limited to the following features): + +- table, view, and seed materializations +- dbt tests + +The adapter should have the required documentation for connecting and configuring the adapter. The dbt docs site should be the single source of truth for this information. These docs should be kept up-to-date. + +See [Documenting a new adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter) for more information. + +### Release Cadence + +Keeping an adapter up-to-date with dbt Core is an integral part of being a trusted adapter. 
Therefore, we ask that adapter maintainers: + +- Release new minor versions of the adapter with all tests passing within four weeks of dbt Core's release cut. +- Release new major versions of the adapter with all tests passing within eight weeks of dbt Core's release cut. + +### Community Responsiveness + +On a best effort basis, active participation and engagement with the dbt Community across the following forums: + +- Being responsive to feedback and supporting user enablement in dbt Community’s Slack workspace +- Responding with comments to issues raised in public dbt adapter code repository +- Merging in code contributions from community members as deemed appropriate + +### Security Practices + +Trusted adapters will not do any of the following: + +- Output to logs or files either access credentials for, or data from, the underlying data platform itself. +- Make API calls other than those expressly required for using dbt features (adapters may not add additional logging) +- Obfuscate code and/or functionality so as to avoid detection + +Additionally, to avoid supply-chain attacks: + +- Use an automated service to keep Python dependencies up-to-date (such as Dependabot or similar), +- Publish directly to PyPI from the dbt adapter code repository by using trusted CI/CD process (such as GitHub actions) +- Restrict admin access to both the respective code (GitHub) and package (PyPI) repositories +- Identify and mitigate security vulnerabilities by use of a static code analyzing tool (such as Snyk) as part of a CI/CD process + +### Other considerations + +The adapter repository is: + +- open-source licensed, +- published to PyPI, and +- automatically tested against dbt Labs' provided adapter test suite + +## How to get an adapter verified? + +Open an issue on the [docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com) using the "Add adapter to Trusted list" template. 
In addition to contact information, it will ask you to confirm that you agree to the following. + +1. my adapter meets the guidelines given above +2. I will make best reasonable effort that this continues to be so +3. checkbox: I acknowledge that dbt Labs reserves the right to remove an adapter from the trusted adapter list at any time, should any of the above guidelines not be met. + +The approval workflow is as follows: + +1. create and populate the template-created issue +2. dbt Labs will respond as quickly as possible (maximally four weeks, though likely faster) +3. If approved, dbt Labs will create and merge a Pull request to formally add the adapter to the list. + +## How to get help with my trusted adapter? + +Ask your question in #adapter-ecosystem channel of the community Slack. diff --git a/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development b/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development new file mode 100644 index 00000000000..8b137891791 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/adapter-development/adapter-development @@ -0,0 +1 @@ + diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices.md b/website/docs/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices.md new file mode 100644 index 00000000000..8713938db86 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices.md @@ -0,0 +1,61 @@ +# Best practices for dbt and Unity Catalog + +Your Databricks dbt project should be configured after following the ["How to set up your databricks dbt project guide"](how-to-set-up-your-databricks-dbt-project). Now we’re ready to start building a dbt project using Unity Catalog. However, we should first consider how we want to allow dbt users to interact with our different catalogs. 
We recommend the following best practices to ensure the integrity of your production data: + +## Isolate your Bronze (aka source) data + +We recommend using Unity Catalog because it allows you to reference data across your organization from any other catalog, legacy Hive metastore, external metastore, or Delta Live Table pipeline outputs. Additionally, Databricks offers the capability to [interact with external data](https://docs.databricks.com/external-data/index.html#interact-with-external-data-on-databricks) and supports query federation to many [database solutions](https://docs.databricks.com/query-federation/index.html#what-is-query-federation-for-databricks-sql). This means your dev and prod environments will have access to your source data, even if it is defined in another catalog or external data source. + +Raw data in your Bronze layer should be defined as dbt [sources](https://docs.getdbt.com/docs/build/sources) and should be read-only for all dbt interactions in both development and production. By default, we recommend that all of these inputs should be accessible by all dbt users in all dbt environments. This ensures that transformations in all environments begin with the same input data, and the results observed in development will be replicated when that code is deployed. That being said, there are times when your company’s data governance requirements necessitate using multiple workspaces or data catalogs depending on the environment. + +If you have different data catalogs/schemas for your source data depending on your environment, you can use the [target.name](https://docs.getdbt.com/reference/dbt-jinja-functions/target#use-targetname-to-change-your-source-database) to change the data catalog/schema you’re pulling from depending on the environment. 
+ +If you use multiple Databricks workspaces to isolate development from production, you can use dbt Cloud’s [environment variables](https://docs.getdbt.com/docs/build/environment-variables) in your connection config strings to reference multiple workspaces from one dbt Cloud project. You can also do the same thing for your SQL warehouse so you can have different sizes based on your environments. + +To do so, use dbt's [environment variable syntax](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-environment-variables#special-environment-variables) for Server Hostname of your Databricks workspace URL and HTTP Path for the SQL warehouse in your connection settings. Note that Server Hostname still needs to appear to be a valid domain name to pass validation checks, so you will need to hard-code the domain suffix on the URL, eg `{{env_var('DBT_HOSTNAME')}}.cloud.databricks.com` and the path prefix for your warehouses, eg `/sql/1.0/warehouses/{{env_var('DBT_HTTP_PATH')}}`. + + + +When you create environments in dbt Cloud, you can assign environment variables to populate the connection information dynamically. Don’t forget to make sure the tokens you use in the credentials for those environments were generated from the associated workspace. + + + +## Access Control + +For granting access to data consumers, use dbt’s [grants config](https://docs.getdbt.com/reference/resource-configs/grants) to apply permissions to database objects generated by dbt models. This lets you configure grants as a structured dictionary rather than writing all the SQL yourself and lets dbt take the most efficient path to apply those grants. + +As for permissions to run dbt and read non-consumer-facing data sources, the table below summarizes an access model. Effectively, all developers should get no more than read access on the prod catalog and write access in the dev catalog. 
When using dbt, schema creation is taken care of for you; unlike traditional data warehousing workflows, you do not need to manually create any Unity Catalog assets other than the top-level catalogs. + +The **prod** service principal should have “read” access to raw source data, and “write” access to the prod catalog. If you add a **test** catalog and associated dbt environment, you should create a dedicated service principal. The test service principal should have *read* on raw source data, and *write* on the **test** catalog but no permissions on the prod or dev catalogs. A dedicated test environment should be used for [CI testing](https://www.getdbt.com/blog/adopting-ci-cd-with-dbt-cloud/) only. + + +**Table-level grants:** + +| | Source Data | Development catalog | Production catalog | Test catalog | +| --- | --- | --- | --- | --- | +| developers | select | select & modify | select or none | none | +| production service principal | select | none | select & modify | none | +| Test service principal | select | none | none | select & modify | + + +**Schema-level grants:** + +| | Source Data | Development catalog | Production catalog | Test catalog | +| --- | --- | --- | --- | --- | +| developers | use | use, create table & create view | use or none | none | +| production service principal | use | none | use, create table & create view | none | +| Test service principal | use | none | none | use, create table & create view | + + +## Next steps + +Ready to start transforming your Unity Catalog datasets with dbt? 
+ +Check out the resources below for guides, tips, and best practices: + +- [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) +- [Self-paced dbt fundamentals training videos](https://courses.getdbt.com/courses/fundamentals) +- [Customizing CI/CD](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/1-cicd-background) & [SQL linting](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/2-lint-on-push) +- [Debugging errors](https://docs.getdbt.com/guides/best-practices/debugging-errors) +- [Writing custom generic tests](https://docs.getdbt.com/guides/best-practices/writing-custom-generic-tests) +- [dbt packages hub](https://hub.getdbt.com/) \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md b/website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md new file mode 100644 index 00000000000..b0be39a4273 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project.md @@ -0,0 +1,103 @@ +# How to set up your Databricks and dbt project + + +Databricks and dbt Labs are partnering to help data teams think like software engineering teams and ship trusted data, faster. The dbt-databricks adapter enables dbt users to leverage the latest Databricks features in their dbt project. Hundreds of customers are now using dbt and Databricks to build expressive and reliable data pipelines on the Lakehouse, generating data assets that enable analytics, ML, and AI use cases throughout the business. + +In this guide, we discuss how to set up your dbt project on the Databricks Lakehouse Platform so that it scales from a small team all the way up to a large organization. + +## Configuring the Databricks Environments + +To get started, we will use Databricks’s Unity Catalog. 
Without it, we would not be able to design separate [environments](https://docs.getdbt.com/docs/collaborate/environments) for development and production per our [best practices](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). It also allows us to ensure the proper access controls have been applied using SQL. You will need to be using the dbt-databricks adapter to use it (as opposed to the dbt-spark adapter). + +We will set up two different *catalogs* in Unity Catalog: **dev** and **prod**. A catalog is a top-level container for *schemas* (previously known as databases in Databricks), which in turn contain tables and views. + +Our dev catalog will be the development environment that analytics engineers interact with through their IDE. Developers should have their own sandbox to build and test objects in without worry of overwriting or dropping a coworker’s work; we recommend creating personal schemas for this purpose. In terms of permissions, they should only have access to the **dev** catalog. + +Only production runs will have access to data in the **prod** catalog. In a future guide, we will discuss a **test** catalog where our continuous integration/continuous deployment (CI/CD) system can run `dbt test`. + +For now, let’s keep things simple and [create two catalogs](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-ddl-create-catalog.html) either using the Data Explorer or in the SQL editor with these commands: + +```sql +create catalog if not exists dev; +create catalog if not exists prod; +``` + +As long as your developer is given write access to the dev data catalog, there is no need to create the sandbox schemas ahead of time. + +## Setting up Service Principals + +When an analytics engineer runs a dbt project from their IDE, it is perfectly fine for the resulting queries to execute with that user’s identity. However, we want production runs to execute with a *service principal's* identity. 
As a reminder, a service principal is a headless account that does not belong to an actual person. + +Service principals are used to remove humans from deploying to production for convenience and security. Personal identities should not be used to build production pipelines because they could break if the user leaves the company or changes their credentials. Also, there should not be ad hoc commands modifying production data. Only scheduled jobs running code that has passed CI tests and code reviews should be allowed to modify production data. If something breaks, there is an auditable trail of changes to find the root cause, easily revert to the last working version of the code, and minimize the impact on end users. + +[Let’s create a service principal](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account) in Databricks: + +1. Have your Databricks Account admin [add a service principal](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-your-databricks-account) to your account. The service principal’s name should differentiate itself from a user ID and make its purpose clear (for example, dbt_prod_sp). +2. Add the service principal to any groups it needs to be a member of at this time. There are more details on permissions in our ["Unity Catalog best practices" guide](dbt-unity-catalog-best-practices). +3. [Add the service principal to your workspace](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-a-workspace) and apply any [necessary entitlements](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#add-a-service-principal-to-a-workspace-using-the-admin-console), such as Databricks SQL access and Workspace access.
+ +## Setting up Databricks Compute + +When you run a dbt project, it generates SQL, which can run on All Purpose Clusters or SQL warehouses. We strongly recommend running dbt-generated SQL on a Databricks SQL warehouse. Since SQL warehouses are optimized for executing SQL queries, you can save on the cost with lower uptime needed for the cluster to run the queries. If you need to debug, you will also have access to a Query Profile. We recommend using a serverless cluster if you want to minimize the time spent on spinning up a cluster and removing the need to change cluster sizes depending on workflows. + +Let’s [create a Databricks SQL warehouse](https://docs.databricks.com/sql/admin/sql-endpoints.html#create-a-sql-warehouse): + +1. Click **SQL Warehouses** in the sidebar. +2. Click *Create SQL Warehouse*. +3. Enter a name for the warehouse. +4. Accept the default warehouse settings or edit them. +5. Click *Create* +6. Configure warehouse permissions to ensure our service principal and developer have the right access. + +We are not covering python in this post but if you want to learn more, check out these [docs](https://docs.getdbt.com/docs/build/python-models#specific-data-platforms). Depending on your workload, you may wish to create a larger SQL Warehouse for production workflows while having a smaller development SQL Warehouse (if you’re not using Serverless SQL Warehouses). + +## Configure your dbt project + +Now that the Databricks components are in place, we can configure our dbt project. This involves connecting dbt to our Databricks SQL warehouse to run SQL queries and using a version control system like GitHub to store our transformation code. + +If you are migrating an existing dbt project from the dbt-spark adapter to dbt-databricks, follow this [migration guide](https://docs.getdbt.com/guides/migration/tools/migrating-from-spark-to-databricks#migration) to switch adapters without needing to update developer credentials and other existing configs. 
+ +If you’re starting a new dbt project, follow the steps below. For a more detailed setup flow, check out our [quickstart guide.](/quickstarts/databricks) + +### Connect dbt to Databricks + +First, you’ll need to connect your dbt project to Databricks so it can send transformation instructions and build objects in Unity Catalog. Follow the instructions for [dbt Cloud](/quickstarts/databricks?step=4) or [Core](https://docs.getdbt.com/reference/warehouse-setups/databricks-setup) to configure your project’s connection credentials. + +Each developer must generate their Databricks PAT and use the token in their development credentials. They will also specify a unique developer schema that will store the tables and views generated by dbt runs executed from their IDE. This provides isolated developer environments and ensures data access is fit for purpose. + +Let’s generate a [Databricks personal access token (PAT)](https://docs.databricks.com/sql/user/security/personal-access-tokens.html) for Development: + +1. In Databricks, click on your Databricks username in the top bar and select User Settings in the drop down. +2. On the Access token tab, click Generate new token. +3. Click Generate. +4. Copy the displayed token and click Done. (don’t lose it!) + + +For your development credentials/profiles.yml: + +1. Set your default catalog to dev. +2. Your developer schema should be named after yourself. We recommend dbt_. + +During your first invocation of `dbt run`, dbt will create the developer schema if it doesn't already exist in the dev catalog. + +### Defining your dbt deployment environment + +Last, we need to give dbt a way to deploy code outside of development environments. To do so, we’ll use dbt [environments](https://docs.getdbt.com/docs/collaborate/environments) to define the production targets that end users will interact with. 
+ +Core projects can use [targets in profiles](https://docs.getdbt.com/docs/core/connection-profiles#understanding-targets-in-profiles) to separate environments. [dbt Cloud environments](https://docs.getdbt.com/docs/cloud/develop-in-the-cloud#set-up-and-access-the-cloud-ide) allow you to define environments via the UI and [schedule jobs](/quickstarts/databricks#create-and-run-a-job) for specific environments. + +Let’s set up our deployment environment: + +1. Follow the Databricks instructions to [set up your service principal’s token](https://docs.databricks.com/dev-tools/service-principals.html#use-curl-or-postman). Note that the `lifetime_seconds` will define how long this credential stays valid. You should use a large number here to avoid regenerating tokens frequently and production job failures. +2. Now let’s pop back over to dbt Cloud to fill out the environment fields. Click on environments in the dbt Cloud UI or define a new target in your profiles.yml. +3. Set the Production environment’s *catalog* to the **prod** catalog created above. Provide the [service token](https://docs.databricks.com/administration-guide/users-groups/service-principals.html#manage-access-tokens-for-a-service-principal) for your **prod** service principal and set that as the *token* in your production environment’s deployment credentials. +4. Set the schema to the default for your prod environment. This can be overridden by [custom schemas](https://docs.getdbt.com/docs/build/custom-schemas#what-is-a-custom-schema) if you need to use more than one. +5. Provide your Service Principal token. + +### Connect dbt to your git repository + +Next, you’ll need somewhere to store and version control your code that allows you to collaborate with teammates. Connect your dbt project to a git repository with [dbt Cloud](/quickstarts/databricks#set-up-a-dbt-cloud-managed-repository). [Core](/quickstarts/manual-install#create-a-repository) projects will use the git CLI. 
+ +## Next steps + +Now that your project is configured, you can start transforming your Databricks data with dbt. To help you scale efficiently, we recommend you follow our best practices, starting with the ["Unity Catalog best practices" guide](dbt-unity-catalog-best-practices). diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md b/website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md new file mode 100644 index 00000000000..b5389645258 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks.md @@ -0,0 +1,173 @@ +--- +title: How to optimize and troubleshoot dbt models on Databricks +sidebar_label: "How to optimize and troubleshoot dbt models on Databricks" +description: "Learn more about optimizing and troubleshooting your dbt models on Databricks" +--- + + +Continuing our Databricks and dbt guide series from the last [guide](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project), it’s time to talk about performance optimization. In this follow-up post,  we outline simple strategies to optimize for cost, performance, and simplicity when architecting your data pipelines. We’ve encapsulated these strategies in this acronym-framework: + +- Platform Components +- Patterns & Best Practices +- Performance Troubleshooting + +## 1. Platform Components + +As you start to develop your dbt projects, one of the first decisions you will make is what kind of backend infrastructure to run your models against. Databricks offers SQL warehouses, All-Purpose Compute, and Jobs Compute, each optimized to workloads they are catered to. Our recommendation is to use Databricks SQL warehouses for all your SQL workloads. 
SQL warehouses are optimized for SQL workloads when compared to other compute options, additionally, they can scale both vertically to support larger workloads and horizontally to support concurrency. Also, SQL warehouses are easier to manage and provide out-of-the-box features such as query history to help audit and optimize your SQL workloads. Between Serverless, Pro, and Classic SQL Warehouse types that Databricks offers, our standard recommendation for you is to leverage Databricks serverless warehouses. You can explore features of these warehouse types in the [Compare features section](https://www.databricks.com/product/pricing/databricks-sql?_gl=1*2rsmlo*_ga*ZmExYzgzZDAtMWU0Ny00N2YyLWFhYzEtM2RhZTQzNTAyZjZi*_ga_PQSEQ3RZQC*MTY3OTYwMDg0Ni4zNTAuMS4xNjc5NjAyMDMzLjUzLjAuMA..&_ga=2.104593536.1471430337.1679342371-fa1c83d0-1e47-47f2-aac1-3dae43502f6b) on the Databricks pricing page. + +With serverless warehouses, you greatly decrease spin-up time waiting for the cluster to warm up and scale time when your cluster needs to horizontally scale. This mitigates the need to keep clusters idle as serverless warehouses will spin up quickly when the workload begins and then spin down when the workload is complete. Plus, serverless warehouses leverage our Photon engine out of the box for optimal performance in both ELT and serving workloads. + +The next step would be to decide how big to make your serverless SQL warehouse. This is not an exact science but these subsections provide you with some quick tips that will drive huge improvements in performance. + +### Sizing your SQL warehouses + +To select the appropriate size of your SQL warehouse, consider the use case and workload you are running and its corresponding latency requirements. You can select a T-shirt size based on the amount of data and auto-scaling based on concurrency needs. A good rule of thumb to follow is to start with a Medium warehouse and work from there. 
For large and complex workloads, bigger warehouses are the way to go and that won’t necessarily mean higher costs. This is because larger warehouses take a shorter time to complete a unit of work. For example, if a Small warehouse takes an hour to complete a pipeline, it will only take half an hour with a Medium. This linear trend continues as long as there’s enough work for the warehouse to perform. + +### Provision warehouses by workload + +Another technique worth implementing is to provision separate SQL warehouses for building dbt pipelines instead of ad hoc, interactive SQL analysis. This is because the query design patterns and compute usage are different for these two types of workloads. Choose T-shirt sizes based on data volumes and SLAs (scale-up principle), and choose auto-scaling based on concurrency requirements (scale-out principle). For larger deployments, this approach could be expanded to map different workload sizes to multiple “pipeline” warehouses, if needed. On the dbt side, take into account the [number of threads you have](/docs/core/connect-data-platform/connection-profiles#understanding-threads), meaning how many dbt models you can run in parallel. The higher the thread count, the more compute you will require. + +### Configure auto-stop + +Because of the ability of serverless warehouses to spin up in a matter of seconds, setting your auto-stop configuration to a lower threshold will not impact SLAs and end-user experience. From the SQL Workspace UI, the default value is 10 minutes and  you can set it to 5 minutes for a lower threshold with the UI. If you would like more custom settings, you can set the threshold to as low as 1 minute with the [API](https://docs.databricks.com/sql/api/sql-endpoints.html#). + +## 2. Patterns & Best Practices + +Now that we have a solid sense of the infrastructure components, we can shift our focus to best practices and design patterns on pipeline development.  
We recommend the staging/intermediate/mart approach which is analogous to the medallion architecture bronze/silver/gold approach that’s recommended by Databricks. Let’s dissect each stage further. + +dbt has guidelines on how you can [structure your dbt project](/guides/best-practices/how-we-structure/1-guide-overview) which you can learn more about. + +### Bronze / Staging Layer: + +There are a few different options for materializing bronze delta tables on Databricks. In the recommended dbt workflow, you should load your flat files into a table first before using dbt to transform on it. To do so, you can use an EL tool to handle this ingestion. + +However, we know this isn't always possible so for data sets in cloud storage, we recommend that you either leverage our `COPY INTO` functionality or stage the external table. In terms of the `COPY INTO` approach, you would have a few different options. The first option would be to run the `COPY INTO` logic as a pre-hook before building your silver/intermediate models. The second option would be to invoke the databricks `COPY INTO` macro with `dbt run-operation` and then subsequently execute your model runs. You can see an example implementation of the [COPY INTO macro](https://github.com/databricks/dbt-databricks/blob/main/docs/databricks-copy-into-macro-aws.md) in the dbt-databricks docs. + +The main benefit of leveraging `COPY INTO` is that it's an incremental operation and it ensures that data is written in Delta format (when we refer to Delta, we are simply referring to the open Parquet tables with a transaction log). If you instead opt to stage an external table, the bronze table retains its raw structure (whether it is CSV, Parquet, JSON, etc.). This would prevent the ability to leverage the performance, reliability, and governance advantages inherent in Delta. Further, external Parquet tables require additional manual work such as running repair operations to ensure new partition metadata is accounted for. 
Nevertheless, staging external tables could be a feasible option if you are migrating to Databricks from another cloud warehouse system, where you heavily leveraged this functionality. + +### Silver / Intermediate Layer + +Now that we have our bronze table taken care of, we can proceed with the silver layer. + +For cost and performance reasons, many customers opt to implement an incremental pipeline approach. The main benefit with this approach is that you process a lot less data when you insert new records into the silver layer, rather than re-creating the table each time with all the data from the bronze layer. However, it should be noted that, by default, [dbt recommends using views and tables](/guides/best-practices/materializations/1-guide-overview) to start out with and then moving to incremental as you require more performance optimization. + +dbt has an [incremental model materialization](/reference/resource-configs/spark-configs#the-merge-strategy) to facilitate this framework. How this works at a high level is that Databricks will create a temp view with a snapshot of data and then merge that snapshot into the silver table. You can customize the time range of the snapshot to suit your specific use case by configuring the `where` conditional in your `is_incremental` logic. The most straightforward implementation is to merge data using a timestamp that’s later than the current max timestamp in the silver table, but there are certainly valid use cases for increasing the temporal range of the source snapshot. + +While merge should be fairly performant out of the box, if you have particularly tight SLAs, there are some more advanced tuning techniques that you can incorporate into your logic. Let us discuss several examples in further detail. + +### File Compaction + +Most compute engines work best when file sizes are between 32 MB and 256 MB.
In Databricks, we take care of optimal file sizing under the hood with our [auto optimize](https://docs.databricks.com/optimizations/auto-optimize.html) features. Auto optimize consists of two distinct features: auto compaction and optimized writes. In Databricks SQL warehouses, optimized writes are enabled by default. We recommend that you [opt in to auto compaction](https://docs.databricks.com/optimizations/auto-optimize.html#when-to-opt-in-to-auto-compaction). + +### Data skipping + +Under the hood, Databricks will naturally [cluster data based on when it was ingested](https://www.databricks.com/blog/2022/11/18/introducing-ingestion-time-clustering-dbr-112.html). Since many queries include timestamps in `where` conditionals, this will naturally lead to a large amount of file skipping for enhanced performance. Nevertheless, if you have other high-cardinality columns (basically columns with a large number of distinct values such as id columns) that are frequently used in `join` keys or `where` conditionals, performance can typically be augmented further by leveraging Z-order. + +The SQL syntax for the Z-Order command is `OPTIMIZE table_name ZORDER BY (col1, col2, col3)`. One caveat to be aware of is that you will rarely want to Z-Order by more than three columns. You will likely want to either run Z-order on run end after your model builds or run Z-Order as a separate scheduled job on a consistent cadence, whether it is daily, weekly, or monthly. + +```sql +config( + +materialized='incremental', + +zorder="column_A" | ["column_A", "column_B"] + +) +``` + +### Analyze Table + +The `ANALYZE TABLE` command ensures that our system has the most up-to-date statistics to select the optimal join plan. You will likely want to either run analyze table as a post-hook after your model builds or run analyze table as a separate scheduled dbt job on a consistent cadence, whether it is daily, weekly, or monthly.
The SQL syntax for this is: + +```sql +ANALYZE TABLE mytable COMPUTE STATISTICS FOR + +COLUMNS col1, col2, col3 +``` + +An important item to clarify is that you will want to prioritize statistics for columns that are frequently used in joins. + +### Vacuum + +When you delete a record from a Delta table, it is a soft delete. What this means is that the record is deleted from the transaction log and is not included in subsequent queries, but the underlying file still remains in cloud storage. If you want to delete the underlying files as well (whether for reducing storage cost or augmenting performance on merges), you can run a vacuum command. The factor you will want to be very cognizant of is restoring older versions of the table. Let’s say you vacuum a table to delete all unused files that are older than 7 days. You won’t be able to restore versions of the table from over 7 days ago that rely on those deleted files, so use with caution. If/when you choose to leverage vacuum, you will likely want to run vacuum using the dbt functionality [on-run-end](/reference/project-configs/on-run-start-on-run-end) after your model builds or run vacuum as a separate scheduled dbt job on a consistent cadence (whether it is daily, weekly, or monthly) using the dbt [run-operation](/reference/commands/run-operation) command (with the vacuum statement in a macro). + +### Gold / Marts Layer + +Now onto the final layer — the gold marts that business stakeholders typically interact with from their preferred BI tool. The considerations here will be fairly similar to the silver layer except that these marts are more likely to handle aggregations. Further, you will likely want to be even more intentional about Z-Ordering these tables as SLAs tend to be lower with these direct stakeholder-facing tables. + +In addition, these tables are well suited for defining [dbt metrics](/docs/build/metrics) on to ensure simplicity and consistency across your key business KPIs!
Using the [dbt_metrics package](https://hub.getdbt.com/dbt-labs/metrics/latest/), you can even query the metrics inside of your own dbt project. With the upcoming Semantic Layer Integration, you can also then query the metrics in any of the partner-integrated tools. + +### Filter rows in target and/or source + +You can filter rows in the target and/or source using `incremental_predicates`, as in this example: + +```sql +{{ + +config( + +materialized='incremental', + +incremental_strategy = 'merge', + +unique_key = 'id', + +incremental_predicates = [ + +"dbt_internal_target.create_at >= '2023-01-01'", "dbt_internal_source.create_at >= '2023-01-01'"], + +) + +}} +``` + +## 3. Performance Troubleshooting + +Performance troubleshooting refers to the process of identifying and resolving issues that impact the performance of your dbt models and overall data pipelines. By improving the speed and performance of your Lakehouse platform, you will be able to process data faster, process large and complex queries more effectively, and provide faster time to market. Let’s go into detail on three effective strategies that you can implement. + +### SQL warehouse query profile + +The SQL warehouse query profile is an effective tool found inside the Databricks SQL workspace. It’s used to troubleshoot slow-running queries, optimize query execution plans, and analyze granular metrics to see where compute resources are being spent. The query profile includes these high-level capability areas: + +- Detailed information about the three main components of query execution, which are time spent in tasks, number of rows processed, and memory consumption. +- Two types of graphical representations: a tree view to easily spot slow operations at a glance, and a graph view that breaks down how data is transformed across tasks. +- Ability to understand mistakes and performance bottlenecks in queries.
+ +The three common examples of performance bottlenecks that can be surfaced by the query profile are: + +### Inefficient file pruning + +By default, Databricks Delta tables collect statistics on the _first 32 columns_ defined in your table schema. When transforming data from the Bronze/staging layer to the Silver/intermediate layer, it is advised to reorder your columns to account for these file-level stats and improve overall performance. Move numerical keys and high cardinality query predicates to the left of the 32nd ordinal position, and move strings and complex data types after the 32nd ordinal position of the table. It is worth mentioning that while you can change the default table property to collect statistics on more columns, it will add more overhead as you write files. You may change this default value by using the [table property](https://docs.databricks.com/delta/table-properties.html), `delta.dataSkippingNumIndexedCols`. + +### Full Table Scans + +The Query Profile provides metrics that allow you to identify the presence of full table scans. A full table scan is a query operation that involves scanning the entire table to retrieve records. It can be a performance issue especially for large tables with billions or trillions of rows. This is because scanning an entire table can be time-consuming and resource-intensive, leading to high memory and CPU usage and slower response times. Table layout techniques such as file compaction and Z-Ordering described in the earlier section of this article will help alleviate this problem. + +### Exploding Joins + +The concept of _exploding joins_ refers to a `join` operation that produces a much larger table result set than either of the input tables used, resulting in a Cartesian product. This performance issue can be identified by enabling the verbose mode setting in the Query Profile and looking at the number of records produced by a join operator. There are several steps you can take to prevent exploding joins. 
As a first step, make the join conditions more specific to reduce the number of rows that are being matched. Another step is to utilize data preprocessing techniques such as aggregating, filtering, and performing data sampling before the join operation. These techniques can reduce the size of the input tables and help prevent exploding joins. + +### Materialization Best Practices + +Remember that data is stored as files, so the unit of I/O work is a file, not a row. That’s a lot of work if we’re dealing with TBs of data. Therefore, we recommend the merge strategy for the majority of incremental models. + +Databricks is committed to continuously improving its performance. For example, in Delta and DBSQL, we’ve greatly improved performance of MERGE operations recently with [low-shuffle merge and Photon](https://www.databricks.com/blog/2022/10/17/faster-merge-performance-low-shuffle-merge-and-photon.html). Many more improvements are in the pipeline, such as deletion vectors for efficient deletes and upserts. Here are the basic strategies to speed it up: + +1. Only read partitions that are important by pushing down filters to scan source and target using filters in *model* and *incremental_predicates* +2. Only update important rows +3. Improve key lookup by defining only *one* materialized key +4. Only update important columns + +### dbt Cloud Discovery API + +Now you might be wondering, how do you identify opportunities for performance improvement inside of dbt? Well, with each job run, dbt Cloud generates metadata on the timing, configuration, and freshness of models in your dbt project. The [dbt Discovery API](/docs/dbt-cloud-apis/discovery-api) is a GraphQL service that supports queries on this metadata, using the [graphical explorer](https://metadata.cloud.getdbt.com/graphiql) or the endpoint itself. Teams can pipe this data into their data warehouse and analyze it like any other data source in a business intelligence platform. 
dbt Cloud users can also use the data from the [Model Timing tab](/docs/deploy/run-visibility#model-timing) to visually identify models that take the most time and may require refactoring. + +### dbt Cloud Admin API + +With the [dbt Cloud Admin API](/docs/dbt-cloud-apis/admin-cloud-api), you can  pull the dbt artifacts from your dbt Cloud run,  put the generated `manifest.json` into an S3 bucket, stage it, and model the data using the [dbt artifacts package](https://hub.getdbt.com/brooklyn-data/dbt_artifacts/latest/). That package can help you identify inefficiencies in your dbt models and pinpoint where opportunities for improvement are. + +## Conclusion + +This concludes the second guide in our series on “Working with Databricks and dbt”, following [How to set up your Databricks and dbt Project](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project). + +We welcome you to try these strategies on our example open source TPC-H implementation and to provide us with thoughts/feedback as you start to incorporate these features into production. Looking forward to your feedback on [#db-databricks-and-spark](https://getdbt.slack.com/archives/CNGCW8HKL) Slack channel! 
diff --git a/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md b/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md new file mode 100644 index 00000000000..a3b4be5a051 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/databricks-guides/productionizing-your-dbt-databricks-project.md @@ -0,0 +1,188 @@ +--- +title: Productionizing your dbt Databricks project +id: "productionizing-your-dbt-databricks-project" +sidebar_label: "Productionizing your dbt Databricks project" +description: "Learn how to deliver models to end users and use best practices to maintain production data" +--- + + +Welcome to the third installment of our comprehensive series on optimizing and deploying your data pipelines using Databricks and dbt Cloud. In this guide, we'll dive into delivering these models to end users while incorporating best practices to ensure that your production data remains reliable and timely. + +## Prerequisites + +If you don't have any of the following requirements, refer to the instructions in the [setup guide](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project) to catch up: + +- You have [set up your Databricks and dbt Cloud environments](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project). +- You have [optimized your dbt models for peak performance](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks). +- You have created two catalogs in Databricks: *dev* and *prod*. +- You have created Databricks Service Principal to run your production jobs. +- You have at least one [deployment environment](/docs/deploy/deploy-environments) in dbt Cloud. + +To get started, let's revisit the deployment environment created for your production data. 
+ +### Deployment environments + +In software engineering, environments play a crucial role in allowing engineers to develop and test code without affecting the end users of their software. Similarly, you can design [data lakehouses](https://www.databricks.com/product/data-lakehouse) with separate environments. The _production_ environment includes the relations (schemas, tables, and views) that end users query or use, typically in a BI tool or ML model. + +In dbt Cloud, [environments](/docs/dbt-cloud-environments) come in two flavors: + +- Deployment — Defines the settings used for executing jobs created within that environment. +- Development — Determines the settings used in the dbt Cloud IDE for a particular dbt Cloud project. + +Each dbt Cloud project can have multiple deployment environments, but only one development environment per user. + +## Create and schedule a production job + +With your deployment environment set up, it's time to create a production job to run in your *prod* environment. + +To deploy our data transformation workflows, we will utilize [dbt Cloud’s built-in job scheduler](/docs/deploy/deploy-jobs). The job scheduler is designed specifically to streamline your dbt project deployments and runs, ensuring that your data pipelines are easy to create, monitor, and modify efficiently. + +Leveraging dbt Cloud's job scheduler allows data teams to own the entire transformation workflow. You don't need to learn and maintain additional tools for orchestration or rely on another team to schedule code written by your team. This end-to-end ownership simplifies the deployment process and accelerates the delivery of new data products. + +Let’s [create a job](/docs/deploy/deploy-jobs#create-and-schedule-jobs) in dbt Cloud that will transform data in our Databricks *prod* catalog. + +1. Create a new job by clicking **Deploy** in the header, click **Jobs** and then **Create job**. +2. **Name** the job “Daily refresh”. +3. 
Set the **Environment** to your *production* environment. + - This will allow the job to inherit the catalog, schema, credentials, and environment variables defined in the [setup guide](https://docs.getdbt.com/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project#defining-your-dbt-deployment-environment). +4. Under **Execution Settings** + - Check the **Generate docs on run** checkbox to configure the job to automatically generate project docs each time this job runs. This will ensure your documentation stays evergreen as models are added and modified. + - Select the **Run on source freshness** checkbox to configure dbt [source freshness](/docs/deploy/source-freshness) as the first step of this job. Your sources will need to be configured to [snapshot freshness information](/docs/build/sources#snapshotting-source-data-freshness) for this to drive meaningful insights. + + Add the following three **Commands:** + - `dbt source freshness` + - This will check if any sources are stale. We don’t want to recompute models with data that hasn’t changed since our last run. + - `dbt test --models source:*` + - This will test the data quality of our source data, such as making sure ID fields are unique and not null. We don’t want bad data getting into production models. + - `dbt build --exclude source:* --fail-fast` + - dbt build is more efficient than issuing separate commands for dbt run and dbt test because it will run and then test each model before continuing. + - We are excluding source data because we already tested it in step 2. + - The fail-fast flag will make dbt exit immediately if a single resource fails to build. If other models are in-progress when the first model fails, then dbt will terminate the connections for these still-running models. +5. Under **Triggers**, use the toggle to configure your job to [run on a schedule](/docs/deploy/deploy-jobs#schedule-days). 
You can enter specific days and timing or create a custom cron schedule. + - If you want your dbt Cloud job scheduled by another orchestrator, like Databricks Workflows, see the [Advanced Considerations](#advanced-considerations) section below. + +This is just one example of an all-or-nothing command list designed to minimize wasted computing. The [job command list](/docs/deploy/job-commands) and [selectors](/reference/node-selection/syntax) provide a lot of flexibility on how your DAG will execute. You may want to design yours to continue running certain models if others fail. You may want to set up multiple jobs to refresh models at different frequencies. See our [Job Creation Best Practices discourse](https://discourse.getdbt.com/t/job-creation-best-practices-in-dbt-cloud-feat-my-moms-lasagna/2980) for more job design suggestions. + +After your job is set up and runs successfully, configure your **[project artifacts](/docs/deploy/artifacts)** to make this job inform your production docs site and data sources dashboard that can be reached from the UI. + +This will be our main production job to refresh data that will be used by end users. Another job everyone should include in their dbt project is a continuous integration job. + +### Add a CI job + +CI/CD, or Continuous Integration and Continuous Deployment/Delivery, has become a standard practice in software development for rapidly delivering new features and bug fixes while maintaining high quality and stability. dbt Cloud enables you to apply these practices to your data transformations. + +The steps below show how to create a CI test for your dbt project. CD in dbt Cloud requires no additional steps, as your jobs will automatically pick up the latest changes from the branch assigned to the environment your job is running in. You may choose to add steps depending on your deployment strategy. 
If you want to dive deeper into CD options, check out [this blog on adopting CI/CD with dbt Cloud](https://www.getdbt.com/blog/adopting-ci-cd-with-dbt-cloud/). + +dbt allows you to write [tests](/docs/build/tests) for your data pipeline, which can be run at every step of the process to ensure the stability and correctness of your data transformations. The main places you’ll use your dbt tests are: + +1. **Daily runs:** Regularly running tests on your data pipeline helps catch issues caused by bad source data, ensuring the quality of data that reaches your users. +2. **Development**: Running tests during development ensures that your code changes do not break existing assumptions, enabling developers to iterate faster by catching problems immediately after writing code. +3. **CI checks**: Automated CI jobs run and test your pipeline end-to-end when a pull request is created, providing confidence to developers, code reviewers, and end users that the proposed changes are reliable and will not cause disruptions or data quality issues. + +Your CI job will ensure that the models build properly and pass any tests applied to them. We recommend creating a separate *test* environment and having a dedicated service principal. This will ensure the temporary schemas created during CI tests are in their own catalog and cannot unintentionally expose data to other users. Repeat the [steps](/guides/dbt-ecosystem/databricks-guides/how-to-set-up-your-databricks-dbt-project) used to create your *prod* environment to create a *test* environment. After setup, you should have: + +- A catalog called *test* +- A service principal called *dbt_test_sp* +- A new dbt Cloud environment called *test* that defaults to the *test* catalog and uses the *dbt_test_sp* token in the deployment credentials + +We recommend setting up a dbt Cloud CI job. This will decrease the job’s runtime by running and testing only modified models, which also reduces compute spend on the lakehouse. 
To create a CI job, refer to [Set up CI jobs](/docs/deploy/ci-jobs) for details. + +With dbt tests and SlimCI, you can feel confident that your production data will be timely and accurate even while delivering at high velocity. + +### Monitor your jobs + +Keeping a close eye on your dbt Cloud jobs is crucial for maintaining a robust and efficient data pipeline. By monitoring job performance and quickly identifying potential issues, you can ensure that your data transformations run smoothly. dbt Cloud provides three entry points to monitor the health of your project: run history, deployment monitor, and status tiles. + +The [run history](/docs/deploy/run-visibility#run-history) dashboard in dbt Cloud provides a detailed view of all your project's job runs, offering various filters to help you focus on specific aspects. This is an excellent tool for developers who want to check recent runs, verify overnight results, or track the progress of running jobs. To access it, select **Run History** from the **Deploy** menu. + +The deployment monitor in dbt Cloud offers a higher-level view of your run history, enabling you to gauge the health of your data pipeline over an extended period of time. This feature includes information on run durations and success rates, allowing you to identify trends in job performance, such as increasing run times or more frequent failures. The deployment monitor also highlights jobs in progress, queued, and recent failures. To access the deployment monitor click on the dbt logo in the top left corner of the dbt Cloud UI. + + + +By adding [status tiles](/docs/deploy/dashboard-status-tiles) to your BI dashboards, you can give stakeholders visibility into the health of your data pipeline without leaving their preferred interface. Status tiles instill confidence in your data and help prevent unnecessary inquiries or context switching. To implement dashboard status tiles, you'll need to have dbt docs with [exposures](/docs/build/exposures) defined. 
+ +### Notifications + +Setting up [notifications](/docs/deploy/job-notifications) in dbt Cloud allows you to receive alerts via email or a Slack channel whenever a run ends. This ensures that the appropriate teams are notified and can take action promptly when jobs fail or are canceled. To set up notifications: + +1. Navigate to your dbt Cloud project settings. +2. Select the **Notifications** tab. +3. Choose the desired notification type (Email or Slack) and configure the relevant settings. + +If you require notifications through other means than email or Slack, you can use dbt Cloud's outbound [webhooks](/docs/deploy/webhooks) feature to relay job events to other tools. Webhooks enable you to [integrate dbt Cloud with a wide range of SaaS applications](/guides/orchestration/webhooks), extending your pipeline’s automation into other systems. + +### Troubleshooting + +When a disruption occurs in your production pipeline, it's essential to know how to troubleshoot issues effectively to minimize downtime and maintain a high degree of trust with your stakeholders. + +The four key steps for troubleshooting dbt Cloud issues are: + +1. Read the error message: dbt error messages usually indicate the error type and the file where the issue occurred. +2. Inspect the problematic file and look for an immediate fix. +3. Isolate the problem by running one model at a time in the IDE or undoing the code that caused the issue. +4. Check for problems in compiled files and logs. + +Consult the [Debugging errors documentation](/guides/best-practices/debugging-errors) for a comprehensive list of error types and diagnostic methods. + +To troubleshoot issues with a dbt Cloud job, navigate to the "Deploy > Run History" tab in your dbt Cloud project and select the failed run. Then, expand the run steps to view [console and debug logs](/docs/deploy/run-visibility#access-logs) to review the detailed log messages. 
To obtain additional information, open the Artifacts tab and download the compiled files associated with the run. + +If your jobs are taking longer than expected, use the [model timing](/docs/deploy/run-visibility#model-timing) dashboard to identify bottlenecks in your pipeline. Analyzing the time taken for each model execution helps you pinpoint the slowest components and optimize them for better performance. The Databricks [Query History](https://docs.databricks.com/sql/admin/query-history.html) lets you inspect granular details such as time spent in each task, rows returned, I/O performance, and execution plan. + +For more on performance tuning, see our guide on [How to Optimize and Troubleshoot dbt Models on Databricks](/guides/dbt-ecosystem/databricks-guides/how_to_optimize_dbt_models_on_databricks). + +## Advanced considerations + +As you become more experienced with dbt Cloud and Databricks, you might want to explore advanced techniques to further enhance your data pipeline and improve the way you manage your data transformations. The topics in this section are not requirements but will help you harden your production environment for greater security, efficiency, and accessibility. + +### Refreshing your data with Databricks Workflows + +The dbt Cloud job scheduler offers several ways to trigger your jobs. If your dbt transformations are just one step of a larger orchestration workflow, use the dbt Cloud API to trigger your job from Databricks Workflows. + +This is a common pattern for analytics use cases that want to minimize latency between ingesting bronze data into the lakehouse with a notebook, transforming that data into gold tables with dbt, and refreshing a dashboard. It is also useful for data science teams who use dbt for feature extraction before using the updated feature store to train and register machine learning models with MLflow. 
+ +The API enables integration between your dbt Cloud jobs and the Databricks workflow, ensuring that your data transformations are effectively managed within the broader context of your data processing pipeline. + +Inserting dbt Cloud jobs into a Databricks Workflows allows you to chain together external tasks while still leveraging these benefits of dbt Cloud: + +- UI Context: The dbt Cloud UI enables you to define the job within the context of your dbt Cloud environments, making it easier to create and manage relevant configs. +- Logs and Run History: Accessing logs and run history becomes more convenient when using dbt Cloud. +- Monitoring and Notification Features: dbt Cloud comes equipped with monitoring and notification features like the ones described above that can help you stay informed about the status and performance of your jobs. + +To trigger your dbt Cloud job from Databricks, follow the instructions in our [Databricks Workflows to run dbt Cloud jobs guide](/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs). + +### Data masking + +Our [Best Practices for dbt and Unity Catalog](/guides/dbt-ecosystem/databricks-guides/dbt-unity-catalog-best-practices) guide recommends using separate catalogs *dev* and *prod* for development and deployment environments, with Unity Catalog and dbt Cloud handling configurations and permissions for environment isolation. Ensuring security while maintaining efficiency in your development and deployment environments is crucial. Additional security measures may be necessary to protect sensitive data, such as personally identifiable information (PII). + +Databricks leverages [Dynamic Views](https://docs.databricks.com/data-governance/unity-catalog/create-views.html#create-a-dynamic-view) to enable data masking based on group membership. Because views in Unity Catalog use Spark SQL, you can implement advanced data masking by using more complex SQL expressions and regular expressions. 
You can now also apply fine grained access controls like row filters in preview and column masks in preview on tables in Databricks Unity Catalog, which will be the recommended approach to protect sensitive data once this goes GA. Additionally, in the near term, Databricks Unity Catalog will also enable Attribute Based Access Control natively, which will make protecting sensitive data at scale simpler. + +To implement data masking in a dbt model, ensure the model materialization configuration is set to view. Next, add a case statement using the is_account_group_member function to identify groups permitted to view plain text values. Then, use regex to mask data for all other users. For example: + +```sql +CASE +WHEN is_account_group_member('auditors') THEN email +ELSE regexp_extract(email, '^.*@(.*)$', 1) +END +``` + +It is recommended not to grant users the ability to read tables and views referenced in the dynamic view. Instead, assign your dbt sources to dynamic views rather than raw data, allowing developers to run end-to-end builds and source freshness commands securely. + +Using the same sources for development and deployment environments enables testing with the same volumes and frequency you will see in production. However, this may cause development runs to take longer than necessary. To address this issue, consider using the Jinja variable target.name to [limit data when working in the development environment](/reference/dbt-jinja-functions/target#use-targetname-to-limit-data-in-dev). + +## Pairing dbt Docs and Unity Catalog + +Though there are similarities between dbt docs and Databricks Unity Catalog, they are ultimately used for different purposes and complement each other well. By combining their strengths, you can provide your organization with a robust and user-friendly data management ecosystem. 
+ +dbt docs is a documentation site generated from your dbt project that provides an interface for developers and non-technical stakeholders to understand the data lineage and business logic applied to transformations without requiring full access to dbt Cloud or Databricks. It gives you additional options on how you can organize and search for your data. You can automatically [build and view your dbt docs using dbt Cloud](/docs/collaborate/build-and-view-your-docs) to keep the documentation evergreen. + +Unity Catalog is a unified governance solution for your lakehouse. It provides a data explorer that can be used for discovery of datasets that have not been defined in dbt. The data explorer also captures [column-level lineage](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html#capture-and-explore-lineage),  when you need to trace the lineage of a specific column. + +To get the most out of both tools, you can use the [persist docs config](/reference/resource-configs/persist_docs) to push table and column descriptions written in dbt into Unity Catalog, making the information easily accessible to both tools' users. Keeping the descriptions in dbt ensures they are version controlled and can be reproduced after a table is dropped. + +## Additional resources + +- [Advanced deployments course](https://courses.getdbt.com/courses/advanced-deployment) if you want a deeper dive into these topics +- [Autoscaling CI: The intelligent Slim CI](https://docs.getdbt.com/blog/intelligent-slim-ci) +- [Trigger a dbt Cloud Job in your automated workflow with Python](https://discourse.getdbt.com/t/triggering-a-dbt-cloud-job-in-your-automated-workflow-with-python/2573) +- [Databricks + dbt Cloud Quickstart Guide](/quickstarts/databricks) +- Reach out to your Databricks account team to get access to preview features on Databricks. 
diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md new file mode 100644 index 00000000000..b03cb2ca013 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/1-overview-dbt-python-snowpark.md @@ -0,0 +1,38 @@ +--- +title: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake" +id: "1-overview-dbt-python-snowpark" +description: "Leverage dbt Cloud to generate analytics and ML-ready pipelines with SQL and Python with Snowflake" +--- + +The focus of this workshop will be to demonstrate how we can use both *SQL and python together* in the same workflow to run *both analytics and machine learning models* on dbt Cloud. + +All code in today’s workshop can be found on [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1). + +## What you'll use during the lab + +- A [Snowflake account](https://trial.snowflake.com/) with ACCOUNTADMIN access +- A [dbt Cloud account](https://www.getdbt.com/signup/) + +## What you'll learn + +- How to build scalable data transformation pipelines using dbt, and Snowflake using SQL and Python +- How to leverage copying data into Snowflake from a public S3 bucket + +## What you need to know + +- Basic to intermediate SQL and python. +- Basic understanding of dbt fundamentals. We recommend the [dbt Fundamentals course](https://courses.getdbt.com/collections) if you're interested. +- High level machine learning process (encoding, training, testing) +- Simple ML algorithms — we will use logistic regression to keep the focus on the *workflow*, not algorithms! + +## What you'll build + +- A set of data analytics and prediction pipelines using Formula 1 data leveraging dbt and Snowflake, making use of best practices like data quality tests and code promotion between environments +- We will create insights for: + 1. 
Finding the lap time average and rolling average through the years (is it generally trending up or down)? + 2. Which constructor has the fastest pit stops in 2021? + 3. Predicting the position of each driver using a decade of data (2010 - 2020) + +As inputs, we are going to leverage Formula 1 datasets hosted on a dbt Labs public S3 bucket. We will create a Snowflake Stage for our CSV files then use Snowflake’s `COPY INTO` function to copy the data in from our CSV files into tables. The Formula 1 dataset is available on [Kaggle](https://www.kaggle.com/datasets/rohanrao/formula-1-world-championship-1950-2020). The data is originally compiled from the [Ergast Developer API](http://ergast.com/mrd/). + +Overall we are going to set up the environments, build scalable pipelines in dbt, establish data tests, and promote code to production. diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md new file mode 100644 index 00000000000..446981214e3 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations.md @@ -0,0 +1,150 @@ +--- +title: "Python transformations!" +id: "10-python-transformations" +description: "Python transformations" +--- + +Up until now, SQL has been driving the project (car pun intended) for data cleaning and hierarchical joining. Now it’s time for Python to take the wheel (car pun still intended) for the rest of our lab! For more information about running Python models on dbt, check out our [docs](/docs/build/python-models). To learn more about how dbt Python works under the hood, check out [Snowpark for Python](https://docs.snowflake.com/en/developer-guide/snowpark/python/index.html), which makes running dbt Python models possible. 
+ +There are quite a few differences between SQL and Python in terms of the dbt syntax and DDL, so we’ll be breaking our code and model runs down further for our python models. + +## Pit stop analysis + +First, we want to find out: which constructor had the fastest pit stops in 2021? (constructor is a Formula 1 team that builds or “constructs” the car). + +1. Create a new file called `fastest_pit_stops_by_constructor.py` in our `aggregates` folder (this is the first time we are using the `.py` extension!). +2. Copy the following code into the file: + ```python + import numpy as np + import pandas as pd + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas","numpy"]) + + # get upstream data + pit_stops_joined = dbt.ref("pit_stops_joined").to_pandas() + + # provide year so we do not hardcode dates + year=2021 + + # describe the data + pit_stops_joined["PIT_STOP_SECONDS"] = pit_stops_joined["PIT_STOP_MILLISECONDS"]/1000 + fastest_pit_stops = pit_stops_joined[(pit_stops_joined["RACE_YEAR"]==year)].groupby(by="CONSTRUCTOR_NAME")["PIT_STOP_SECONDS"].describe().sort_values(by='mean') + fastest_pit_stops.reset_index(inplace=True) + fastest_pit_stops.columns = fastest_pit_stops.columns.str.upper() + + return fastest_pit_stops.round(2) + ``` + +3. Let’s break down what this code is doing step by step: + - First, we are importing the Python libraries that we are using. A *library* is a reusable chunk of code that someone else wrote that you may want to include in your programs/projects. We are using `numpy` and `pandas` in this Python model. This is similar to a dbt *package*, but our Python libraries do *not* persist across the entire project. + - Defining a function called `model` with the parameters `dbt` and `session`. The parameter `dbt` is a class compiled by dbt, which enables you to run your Python code in the context of your dbt project and DAG. The parameter `session` is a class representing your Snowflake’s connection to the Python backend. 
The `model` function *must return a single DataFrame*. You can see that all the data transformation happening is within the body of the `model` function that the `return` statement is tied to. + - Then, within the context of our dbt model library, we are passing in a configuration of which packages we need using `dbt.config(packages=["pandas","numpy"])`. + - Use the `.ref()` function to retrieve the data frame `pit_stops_joined` that we created in our last step using SQL. We cast this to a pandas dataframe (by default it's a Snowpark Dataframe). + - Create a variable named `year` so we aren’t passing a hardcoded value. + - Generate a new column called `PIT_STOP_SECONDS` by dividing the value of `PIT_STOP_MILLISECONDS` by 1000. + - Create our final data frame `fastest_pit_stops` that holds the records where year is equal to our year variable (2021 in this case), then group the data frame by `CONSTRUCTOR_NAME` and use the `describe()` and `sort_values()` methods to sort by the mean pit stop time in ascending order (the default for `sort_values()`). This will make our first row in the new aggregated data frame the team with the fastest pit stops over an entire competition year. + - Finally, it resets the index of the `fastest_pit_stops` data frame. The `reset_index()` method allows you to reset the index back to the default 0, 1, 2, etc indexes. By default, this method will keep the "old" indexes in a column named "index"; to avoid this, use the drop parameter. Think of this as keeping your data “flat and square” as opposed to “tiered”. If you are new to Python, now might be a good time to [learn about indexes for 5 minutes](https://towardsdatascience.com/the-basics-of-indexing-and-slicing-python-lists-2d12c90a94cf) since it's the foundation of how Python retrieves, slices, and dices data. The `inplace` argument means we override the existing data frame permanently. Not to fear! This is what we want to do to avoid dealing with multi-indexed dataframes! 
+ - Convert our Python column names to all uppercase using `.upper()`, so Snowflake recognizes them. + - Finally we are returning our dataframe with 2 decimal places for all the columns using the `round()` method. +4. Zooming out a bit, what are we doing differently here in Python from our typical SQL code: + - Method chaining is a technique in which multiple methods are called on an object in a single statement, with each method call modifying the result of the previous one. The methods are called in a chain, with the output of one method being used as the input for the next one. The technique is used to simplify the code and make it more readable by eliminating the need for intermediate variables to store the intermediate results. + - The way you see method chaining in Python is the syntax `.().()`. For example, `.describe().sort_values(by='mean')` where the `.describe()` method is chained to `.sort_values()`. + - The `.describe()` method is used to generate various summary statistics of the dataset. It's used on pandas dataframe. It gives a quick and easy way to get the summary statistics of your dataset without writing multiple lines of code. + - The `.sort_values()` method is used to sort a pandas dataframe or a series by one or multiple columns. The method sorts the data by the specified column(s) in ascending or descending order. It is the pandas equivalent to `order by` in SQL. + + We won’t go as in depth for our subsequent scripts, but will continue to explain at a high level what new libraries, functions, and methods are doing. + +5. Build the model using the UI which will **execute**: + ```bash + dbt run --select fastest_pit_stops_by_constructor + ``` + in the command bar. + + Let’s look at some details of our first Python model to see what our model executed. There are two major differences we can see while running a Python model compared to an SQL model: + + - Our Python model was executed as a stored procedure. 
Snowflake needs a way to know that it's meant to execute this code in a Python runtime, instead of interpreting in a SQL runtime. We do this by creating a Python stored proc, called by a SQL command. + - The `snowflake-snowpark-python` library has been picked up to execute our Python code. Even though this wasn’t explicitly stated this is picked up by the dbt class object because we need our Snowpark package to run Python! + + Python models take a bit longer to run than SQL models, however we could always speed this up by using [Snowpark-optimized Warehouses](https://docs.snowflake.com/en/user-guide/warehouses-snowpark-optimized.html) if we wanted to. Our data is sufficiently small, so we won’t worry about creating a separate warehouse for Python versus SQL files today. + + + The rest of our **Details** output gives us information about how dbt and Snowpark for Python are working together to define class objects and apply a specific set of methods to run our models. + + So which constructor had the fastest pit stops in 2021? Let’s look at our data to find out! + +6. We can't preview Python models directly, so let’s create a new file using the **+** button or the Control-n shortcut to create a new scratchpad. +7. Reference our Python model: + ```sql + select * from {{ ref('fastest_pit_stops_by_constructor') }} + ``` + and preview the output: + + + Not only did Red Bull have the fastest average pit stops by nearly 40 seconds, they also had the smallest standard deviation, meaning they are both the fastest and most consistent team in pit stops. By using the `.describe()` method we were able to avoid verbose SQL requiring us to create a line of code per column and repetitively use the `PERCENTILE_CONT()` function. + + Now we want to find the lap time average and rolling average through the years (is it generally trending up or down)? + +8. Create a new file called `lap_times_moving_avg.py` in our `aggregates` folder. +9. 
Copy the following code into the file: + ```python + import pandas as pd + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas"]) + + # get upstream data + lap_times = dbt.ref("int_lap_times_years").to_pandas() + + # describe the data + lap_times["LAP_TIME_SECONDS"] = lap_times["LAP_TIME_MILLISECONDS"]/1000 + lap_time_trends = lap_times.groupby(by="RACE_YEAR")["LAP_TIME_SECONDS"].mean().to_frame() + lap_time_trends.reset_index(inplace=True) + lap_time_trends["LAP_MOVING_AVG_5_YEARS"] = lap_time_trends["LAP_TIME_SECONDS"].rolling(5).mean() + lap_time_trends.columns = lap_time_trends.columns.str.upper() + + return lap_time_trends.round(1) + ``` + +10. Breaking down our code a bit: + - We’re only using the `pandas` library for this model and casting the upstream data to a pandas data frame with `.to_pandas()`. + - Generate a new column called `LAP_TIME_SECONDS` by dividing the value of `LAP_TIME_MILLISECONDS` by 1000. + - Create the final dataframe. Get the lap time per year. Calculate the mean series and convert to a data frame. + - Reset the index. + - Calculate the rolling 5 year mean. + - Round our numeric columns to one decimal place. +11. Now, run this model by using the UI **Run model** or + ```bash + dbt run --select lap_times_moving_avg + ``` + in the command bar. + +12. Once again, preview the output of our data using the same steps as for our `fastest_pit_stops_by_constructor` model. + + + We can see that it looks like lap times are getting consistently faster over time. Then in 2010 we see an increase occur! Using outside subject matter context, we know that significant rule changes were introduced to Formula 1 in 2010 and 2011 causing slower lap times. + +13. Now is a good time to checkpoint and commit our work to Git. Click **Commit and push** and give your commit a message like `aggregate python models` before moving on. 
+ +## The dbt model, .source(), .ref() and .config() functions + +Let’s take a step back before starting machine learning to both review and go more in-depth at the methods that make running dbt python models possible. If you want to know more outside of this lab’s explanation read the documentation [here](/docs/build/python-models?version=1.3). + +- dbt model(dbt, session). For starters, each Python model lives in a .py file in your models/ folder. It defines a function named `model()`, which takes two parameters: + - dbt — A class compiled by dbt Core, unique to each model, enables you to run your Python code in the context of your dbt project and DAG. + - session — A class representing your data platform’s connection to the Python backend. The session is needed to read in tables as DataFrames and to write DataFrames back to tables. In PySpark, by convention, the SparkSession is named spark, and available globally. For consistency across platforms, we always pass it into the model function as an explicit argument called session. +- The `model()` function must return a single DataFrame. On Snowpark (Snowflake), this can be a Snowpark or pandas DataFrame. +- `.source()` and `.ref()` functions. Python models participate fully in dbt's directed acyclic graph (DAG) of transformations. If you want to read directly from a raw source table, use `dbt.source()`. We saw this in our earlier section using SQL with the source function. These functions have the same execution, but with different syntax. Use the `dbt.ref()` method within a Python model to read data from other models (SQL or Python). These methods return DataFrames pointing to the upstream source, model, seed, or snapshot. +- `.config()`. 
Just like SQL models, there are three ways to configure Python models: + - In a dedicated `.yml` file, within the `models/` directory + - Within the model's `.py` file, using the `dbt.config()` method + - Calling the `dbt.config()` method will set configurations for your model within your `.py` file, similar to the `{{ config() }} macro` in `.sql` model files: + ```python + def model(dbt, session): + + # setting configuration + dbt.config(materialized="table") + ``` + - There's a limit to how complex you can get with the `dbt.config()` method. It accepts only literal values (strings, booleans, and numeric types). Passing another function or a more complex data structure is not possible. The reason is that dbt statically analyzes the arguments to `.config()` while parsing your model without executing your Python code. If you need to set a more complex configuration, we recommend you define it using the config property in a [YAML file](/reference/resource-properties/config). Learn more about configurations [here](/reference/model-configs). diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md new file mode 100644 index 00000000000..bde163b59db --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep.md @@ -0,0 +1,225 @@ +--- +title: "Machine Learning prep: cleaning, encoding, and splits, oh my!" +id: "11-machine-learning-prep" +description: "Machine Learning prep" +--- +Now that we’ve gained insights and business intelligence about Formula 1 at a descriptive level, we want to extend our capabilities into prediction. We’re going to take the scenario where we censor the data. This means that we will pretend that we will train a model using earlier data and apply it to future data. In practice, this means we’ll take data from 2010-2019 to train our model and then predict 2020 data. 
+ +In this section, we’ll be preparing our data to predict the final race position of a driver. + +At a high level we’ll be: + +- Creating new prediction features and filtering our dataset to active drivers +- Encoding our data (algorithms like numbers) and simplifying our target variable called `position` +- Splitting our dataset into training, testing, and validation + +## ML data prep + +1. To keep our project organized, we’ll need to create two new subfolders in our `ml` directory. Under the `ml` folder, make the subfolders `prep` and `train_predict`. +2. Create a new file under `ml/prep` called `ml_data_prep`. Copy the following code into the file and **Save**. + ```python + import pandas as pd + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas"]) + + # get upstream data + fct_results = dbt.ref("fct_results").to_pandas() + + # provide years so we do not hardcode dates in filter command + start_year=2010 + end_year=2020 + + # describe the data for a full decade + data = fct_results.loc[fct_results['RACE_YEAR'].between(start_year, end_year)] + + # convert string to an integer + data['POSITION'] = data['POSITION'].astype(float) + + # we cannot have nulls if we want to use total pit stops + data['TOTAL_PIT_STOPS_PER_RACE'] = data['TOTAL_PIT_STOPS_PER_RACE'].fillna(0) + + # some of the constructors changed their name over the year so replacing old names with current name + mapping = {'Force India': 'Racing Point', 'Sauber': 'Alfa Romeo', 'Lotus F1': 'Renault', 'Toro Rosso': 'AlphaTauri'} + data['CONSTRUCTOR_NAME'].replace(mapping, inplace=True) + + # create confidence metrics for drivers and constructors + dnf_by_driver = data.groupby('DRIVER').sum()['DNF_FLAG'] + driver_race_entered = data.groupby('DRIVER').count()['DNF_FLAG'] + driver_dnf_ratio = (dnf_by_driver/driver_race_entered) + driver_confidence = 1-driver_dnf_ratio + driver_confidence_dict = dict(zip(driver_confidence.index,driver_confidence)) + + dnf_by_constructor = 
data.groupby('CONSTRUCTOR_NAME').sum()['DNF_FLAG'] + constructor_race_entered = data.groupby('CONSTRUCTOR_NAME').count()['DNF_FLAG'] + constructor_dnf_ratio = (dnf_by_constructor/constructor_race_entered) + constructor_relaiblity = 1-constructor_dnf_ratio + constructor_relaiblity_dict = dict(zip(constructor_relaiblity.index,constructor_relaiblity)) + + data['DRIVER_CONFIDENCE'] = data['DRIVER'].apply(lambda x:driver_confidence_dict[x]) + data['CONSTRUCTOR_RELAIBLITY'] = data['CONSTRUCTOR_NAME'].apply(lambda x:constructor_relaiblity_dict[x]) + + #removing retired drivers and constructors + active_constructors = ['Renault', 'Williams', 'McLaren', 'Ferrari', 'Mercedes', + 'AlphaTauri', 'Racing Point', 'Alfa Romeo', 'Red Bull', + 'Haas F1 Team'] + active_drivers = ['Daniel Ricciardo', 'Kevin Magnussen', 'Carlos Sainz', + 'Valtteri Bottas', 'Lance Stroll', 'George Russell', + 'Lando Norris', 'Sebastian Vettel', 'Kimi Räikkönen', + 'Charles Leclerc', 'Lewis Hamilton', 'Daniil Kvyat', + 'Max Verstappen', 'Pierre Gasly', 'Alexander Albon', + 'Sergio Pérez', 'Esteban Ocon', 'Antonio Giovinazzi', + 'Romain Grosjean','Nicholas Latifi'] + + # create flags for active drivers and constructors so we can filter downstream + data['ACTIVE_DRIVER'] = data['DRIVER'].apply(lambda x: int(x in active_drivers)) + data['ACTIVE_CONSTRUCTOR'] = data['CONSTRUCTOR_NAME'].apply(lambda x: int(x in active_constructors)) + + return data + ``` +3. As usual, let’s break down what we are doing in this Python model: + - We’re first referencing our upstream `fct_results` table and casting it to a pandas dataframe. + - Filtering on years 2010-2020 since we’ll need to clean all our data we are using for prediction (both training and testing). + - Filling in empty data for `total_pit_stops` and making a mapping active constructors and drivers to avoid erroneous predictions + - ⚠️ You might be wondering why we didn’t do this upstream in our `fct_results` table! 
The reason for this is that we want our machine learning cleanup to reflect the year 2020 for our predictions and give us an up-to-date team name. However, for business intelligence purposes we can keep the historical data at that point in time. Instead of thinking of one table as “one source of truth” we are creating different datasets fit for purpose: one for historical descriptions and reporting and another for relevant predictions. + - Create new confidence features for drivers and constructors + - Generate flags for the constructors and drivers that were active in 2020 +4. Execute the following in the command bar: + ```bash + dbt run --select ml_data_prep + ``` +5. There are more aspects we could consider for this project, such as normalizing the driver confidence by the number of races entered. Including this would help account for a driver’s history and consider whether they are a new or long-time driver. We’re going to keep it simple for now, but these are some of the ways we can expand and improve our machine learning dbt projects. Breaking down our machine learning prep model: + - Lambda functions — We use some lambda functions to transform our data without having to create a fully-fledged function using the `def` notation. So what exactly are lambda functions? + - In Python, a lambda function is a small, anonymous function defined using the keyword "lambda". Lambda functions are used to perform a quick operation, such as a mathematical calculation or a transformation on a list of elements. They are often used in conjunction with higher-order functions, such as `apply`, `map`, `filter`, and `reduce`. + - `.apply()` method — We used `.apply()` to pass our functions into our lambda expressions to the columns and perform this multiple times in our code. Let’s explain apply a little more: + - The `.apply()` function in the pandas library is used to apply a function to a specified axis of a DataFrame or a Series. 
In our case the function we used was our lambda function! + - The `.apply()` function takes two arguments: the first is the function to be applied, and the second is the axis along which the function should be applied. The axis can be specified as 0 for rows or 1 for columns. We are using the default value of 0 so we aren’t explicitly writing it in the code. This means that the function will be applied to each *row* of the DataFrame or Series. +6. Let’s look at the preview of our clean dataframe after running our `ml_data_prep` model: + + +## Covariate encoding + +In this next part, we’ll be performing covariate encoding. Breaking down this phrase a bit, a *covariate* is a variable that is relevant to the outcome of a study or experiment, and *encoding* refers to the process of converting data (such as text or categorical variables) into a numerical format that can be used as input for a model. This is necessary because most machine learning algorithms can only work with numerical data. Algorithms don’t speak languages, have eyes to see images, etc. so we encode our data into numbers so algorithms can perform tasks by using calculations they otherwise couldn’t. + +🧠 We’ll think about this as : “algorithms like numbers”. + +1. Create a new file under `ml/prep` called `covariate_encoding` copy the code below and save. 
+ ```python + import pandas as pd + import numpy as np + from sklearn.preprocessing import StandardScaler,LabelEncoder,OneHotEncoder + from sklearn.linear_model import LogisticRegression + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas","numpy","scikit-learn"]) + + # get upstream data + data = dbt.ref("ml_data_prep").to_pandas() + + # list out covariates we want to use in addition to outcome variable we are modeling - position + covariates = data[['RACE_YEAR','CIRCUIT_NAME','GRID','CONSTRUCTOR_NAME','DRIVER','DRIVERS_AGE_YEARS','DRIVER_CONFIDENCE','CONSTRUCTOR_RELAIBLITY','TOTAL_PIT_STOPS_PER_RACE','ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR', 'POSITION']] + + # filter covariates on active drivers and constructors + # use fil_cov as short for "filtered_covariates" + fil_cov = covariates[(covariates['ACTIVE_DRIVER']==1)&(covariates['ACTIVE_CONSTRUCTOR']==1)] + + # Encode categorical variables using LabelEncoder + # TODO: we'll update this to both ohe in the future for non-ordinal variables! + le = LabelEncoder() + fil_cov['CIRCUIT_NAME'] = le.fit_transform(fil_cov['CIRCUIT_NAME']) + fil_cov['CONSTRUCTOR_NAME'] = le.fit_transform(fil_cov['CONSTRUCTOR_NAME']) + fil_cov['DRIVER'] = le.fit_transform(fil_cov['DRIVER']) + fil_cov['TOTAL_PIT_STOPS_PER_RACE'] = le.fit_transform(fil_cov['TOTAL_PIT_STOPS_PER_RACE']) + + # Simply target variable "position" to represent 3 meaningful categories in Formula1 + # 1. Podium position 2. Points for team 3. Nothing - no podium or points! + def position_index(x): + if x<4: + return 1 + if x>10: + return 3 + else : + return 2 + + # we are dropping the columns that we filtered on in addition to our training variable + encoded_data = fil_cov.drop(['ACTIVE_DRIVER','ACTIVE_CONSTRUCTOR'],1) + encoded_data['POSITION_LABEL']= encoded_data['POSITION'].apply(lambda x: position_index(x)) + encoded_data_grouped_target = encoded_data.drop(['POSITION'],1) + + return encoded_data_grouped_target + ``` +2. 
Execute the following in the command bar: + ```bash + dbt run --select covariate_encoding + ``` +3. In this code, we are using a ton of functions from libraries! This is really cool, because we can utilize code other people have developed and bring it into our project simply by using the `import` function. [Scikit-learn](https://scikit-learn.org/stable/), “sklearn” for short, is an extremely popular data science library. Sklearn contains a wide range of machine learning techniques, including supervised and unsupervised learning algorithms, feature scaling and imputation, as well as tools for model evaluation and selection. We’ll be using Sklearn for both preparing our covariates and creating models (our next section). +4. Our dataset is pretty small, so we are good to use pandas and `sklearn`. If you have larger data for your own project in mind, consider `dask` or `category_encoders`. +5. Breaking it down a bit more: + - We’re selecting a subset of variables that will be used as predictors for a driver’s position. + - Filter the dataset to only include rows using the active driver and constructor flags we created in the last step. + - The next step is to use the `LabelEncoder` from scikit-learn to convert the categorical variables `CIRCUIT_NAME`, `CONSTRUCTOR_NAME`, `DRIVER`, and `TOTAL_PIT_STOPS_PER_RACE` into numerical values. + - Create a new variable called `POSITION_LABEL`, which is derived from our position variable. + - 💭 Why are we changing our position variable? There are 20 total positions in Formula 1 and we are grouping them together to simplify the classification and improve performance. We also want to demonstrate you can create a new function within your dbt model! + - Our new `position_label` variable has meaning: + - In Formula1 if you are in: + - Top 3 you get a “podium” position + - Top 10 you gain points that add to your overall season total + - Below top 10 you get no points! 
+ - We are mapping our original `position` variable to `position_label`, assigning the corresponding places above to 1, 2, and 3 respectively. + - Drop the active driver and constructor flags since they were filter criteria and additionally drop our original position variable. + +## Splitting into training and testing datasets + +Now that we’ve cleaned and encoded our data, we are going to further split it by time. In this step, we will create dataframes to use for training and prediction. We’ll be creating two dataframes 1) using data from 2010-2019 for training, and 2) data from 2020 for new prediction inferences. We’ll create variables called `start_year` and `end_year` so we aren’t filtering on hardcoded values (and can more easily swap them out in the future if we want to retrain our model on different timeframes). + +1. Create a file called `train_test_dataset` copy and save the following code: + ```python + import pandas as pd + + def model(dbt, session): + + # dbt configuration + dbt.config(packages=["pandas"], tags="train") + + # get upstream data + encoding = dbt.ref("covariate_encoding").to_pandas() + + # provide years so we do not hardcode dates in filter command + start_year=2010 + end_year=2019 + + # describe the data for a full decade + train_test_dataset = encoding.loc[encoding['RACE_YEAR'].between(start_year, end_year)] + + return train_test_dataset + ``` + +2. Create a file called `hold_out_dataset_for_prediction` copy and save the following code below. Now we’ll have a dataset with only the year 2020 that we’ll keep as a hold out set that we are going to use similar to a deployment use case. 
+ ```python + import pandas as pd + + def model(dbt, session): + # dbt configuration + dbt.config(packages=["pandas"], tags="predict") + + # get upstream data + encoding = dbt.ref("covariate_encoding").to_pandas() + + # variable for year instead of hardcoding it + year=2020 + + # filter the data based on the specified year + hold_out_dataset = encoding.loc[encoding['RACE_YEAR'] == year] + + return hold_out_dataset + ``` +3. Execute the following in the command bar: + ```bash + dbt run --select train_test_dataset hold_out_dataset_for_prediction + ``` + To run our temporal data split models, we can use this syntax in the command line to run them both at once. Make sure you use a *space* [syntax](/reference/node-selection/syntax) between the model names to indicate you want to run both! +4. **Commit and push** our changes to keep saving our work as we go using `ml data prep and splits` before moving on. + +👏 Now that we’ve finished our machine learning prep work we can move onto the fun part — training and prediction! diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md new file mode 100644 index 00000000000..8b353a85fa3 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-testing.md @@ -0,0 +1,251 @@ +--- +title: "Machine Learning: training and prediction " +id: "12-machine-learning-training-prediction" +description: "Machine Learning: training and prediction" +--- + +We’re ready to start training a model to predict the driver’s position. Now is a good time to pause and take a step back and say, usually in ML projects you’ll try multiple algorithms during development and use an evaluation method such as cross validation to determine which algorithm to use. 
You can definitely do this in your dbt project, but for the content of this lab we’ll have decided on using a logistic regression to predict position (we actually tried some other algorithms using cross validation outside of this lab such as k-nearest neighbors and a support vector classifier but that didn’t perform as well as the logistic regression and a decision tree that overfit). + +There are 3 areas to break down as we go since we are working at the intersection all within one model file: +1. Machine Learning +2. Snowflake and Snowpark +3. dbt Python models + +If you haven’t seen code like this before or use joblib files to save machine learning models, we’ll be going over them at a high level and you can explore the links for more technical in-depth along the way! Because Snowflake and dbt have abstracted away a lot of the nitty gritty about serialization and storing our model object to be called again, we won’t go into too much detail here. There’s *a lot* going on here so take it at your pace! + +## Training and saving a machine learning model + +1. Project organization remains key, so let’s make a new subfolder called `train_predict` under the `ml` folder. +2. 
Now create a new file called `train_test_position` and copy and save the following code: + + ```python + import snowflake.snowpark.functions as F + from sklearn.model_selection import train_test_split + import pandas as pd + from sklearn.metrics import confusion_matrix, balanced_accuracy_score + import io + from sklearn.linear_model import LogisticRegression + from joblib import dump, load + import joblib + import logging + import sys + from joblib import dump, load + + logger = logging.getLogger("mylog") + + def save_file(session, model, path, dest_filename): + input_stream = io.BytesIO() + joblib.dump(model, input_stream) + session._conn.upload_stream(input_stream, path, dest_filename) + return "successfully created file: " + path + + def model(dbt, session): + dbt.config( + packages = ['numpy','scikit-learn','pandas','numpy','joblib','cachetools'], + materialized = "table", + tags = "train" + ) + # Create a stage in Snowflake to save our model file + session.sql('create or replace stage MODELSTAGE').collect() + + #session._use_scoped_temp_objects = False + version = "1.0" + logger.info('Model training version: ' + version) + + # read in our training and testing upstream dataset + test_train_df = dbt.ref("train_test_dataset") + + # cast snowpark df to pandas df + test_train_pd_df = test_train_df.to_pandas() + target_col = "POSITION_LABEL" + + # split out covariate predictors, x, from our target column position_label, y. + split_X = test_train_pd_df.drop([target_col], axis=1) + split_y = test_train_pd_df[target_col] + + # Split out our training and test data into proportions + X_train, X_test, y_train, y_test = train_test_split(split_X, split_y, train_size=0.7, random_state=42) + train = [X_train, y_train] + test = [X_test, y_test] + # now we are only training our one model to deploy + # we are keeping the focus on the workflows and not algorithms for this lab! 
+ model = LogisticRegression() + + # fit the preprocessing pipeline and the model together + model.fit(X_train, y_train) + y_pred = model.predict_proba(X_test)[:,1] + predictions = [round(value) for value in y_pred] + balanced_accuracy = balanced_accuracy_score(y_test, predictions) + + # Save the model to a stage + save_file(session, model, "@MODELSTAGE/driver_position_"+version, "driver_position_"+version+".joblib" ) + logger.info('Model artifact:' + "@MODELSTAGE/driver_position_"+version+".joblib") + + # Take our pandas training and testing dataframes and put them back into snowpark dataframes + snowpark_train_df = session.write_pandas(pd.concat(train, axis=1, join='inner'), "train_table", auto_create_table=True, create_temp_table=True) + snowpark_test_df = session.write_pandas(pd.concat(test, axis=1, join='inner'), "test_table", auto_create_table=True, create_temp_table=True) + + # Union our training and testing data together and add a column indicating train vs test rows + return snowpark_train_df.with_column("DATASET_TYPE", F.lit("train")).union(snowpark_test_df.with_column("DATASET_TYPE", F.lit("test"))) + ``` + +3. Execute the following in the command bar: + ```bash + dbt run --select train_test_position + ``` +4. Breaking down our Python script here: + - We’re importing some helpful libraries. + - Defining a function called `save_file()` that takes four parameters: `session`, `model`, `path` and `dest_filename` that will save our logistic regression model file. + - `session` — an object representing a connection to Snowflake. + - `model` — an object that needs to be saved. In this case, it's a Python object that is a scikit-learn that can be serialized with joblib. + - `path` — a string representing the directory or bucket location where the file should be saved. + - `dest_filename` — a string representing the desired name of the file. 
+ - Creating our dbt model + - Within this model we are creating a stage called `MODELSTAGE` to place our logistic regression `joblib` model file. This is really important since we need a place to keep our model to reuse and want to ensure it's there. When using Snowpark commands, it's common to see the `.collect()` method to ensure the action is performed. Think of the session as our “start” and collect as our “end” when [working with Snowpark](https://docs.snowflake.com/en/developer-guide/snowpark/python/working-with-dataframes.html) (you can use other ending methods other than collect). + - Using `.ref()` to connect into our `train_test_dataset` model. + - Now we see the machine learning part of our analysis: + - Create new dataframes separating our prediction features from our target variable `position_label`. + - Split our dataset into 70% training (and 30% testing), train_size=0.7 with a `random_state` specified to have repeatable results. + - Specify our model is a logistic regression. + - Fit our model. In a logistic regression this means finding the coefficients that will give the least classification error. + - Round our predictions to the nearest integer since logistic regression creates a probability between 0 and 1 for each class, and calculate a balanced accuracy to account for imbalances in the target variable. + - Right now our model is only in memory, so we need to use our nifty function `save_file` to save our model file to our Snowflake stage. We save our model as a joblib file so Snowpark can easily call this model object back to create predictions. We really don’t need to know much else as a data practitioner unless we want to. It’s worth noting that joblib files aren’t able to be queried directly by SQL. To do this, we would need to transform the joblib file to an SQL-queryable format such as JSON or CSV (out of scope for this workshop). 
+ - Finally we want to return our dataframe, but create a new column indicating what rows were used for training and those for testing. +5. Viewing our output of this model: + + +6. Let’s pop back over to Snowflake and check that our logistic regression model has been stored in our `MODELSTAGE` using the command: + ```sql + list @modelstage + ``` + + +7. To investigate the commands run as part of the `train_test_position` script, navigate to the Snowflake query history under **Activity > Query History**. We can view the portions of the query that we wrote such as `create or replace stage MODELSTAGE`, but we also see additional queries that Snowflake uses to interpret Python code. + + +## Predicting on new data + +1. Create a new file called `predict_position` and copy and save the following code: + ```python + import logging + import joblib + import pandas as pd + import os + from snowflake.snowpark import types as T + + DB_STAGE = 'MODELSTAGE' + version = '1.0' + # The name of the model file + model_file_path = 'driver_position_'+version + model_file_packaged = 'driver_position_'+version+'.joblib' + + # This is a local directory, used for storing the various artifacts locally + LOCAL_TEMP_DIR = f'/tmp/driver_position' + DOWNLOAD_DIR = os.path.join(LOCAL_TEMP_DIR, 'download') + TARGET_MODEL_DIR_PATH = os.path.join(LOCAL_TEMP_DIR, 'ml_model') + TARGET_LIB_PATH = os.path.join(LOCAL_TEMP_DIR, 'lib') + + # The feature columns that were used during model training + # and that will be used during prediction + FEATURE_COLS = [ + "RACE_YEAR" + ,"CIRCUIT_NAME" + ,"GRID" + ,"CONSTRUCTOR_NAME" + ,"DRIVER" + ,"DRIVERS_AGE_YEARS" + ,"DRIVER_CONFIDENCE" + ,"CONSTRUCTOR_RELAIBLITY" + ,"TOTAL_PIT_STOPS_PER_RACE"] + + def register_udf_for_prediction(p_predictor ,p_session ,p_dbt): + + # The prediction udf + + def predict_position(p_df: T.PandasDataFrame[int, int, int, int, + int, int, int, int, int]) -> T.PandasSeries[int]: + # Snowpark currently does not set the column name in the input
dataframe + # The default col names are like 0,1,2,... Hence we need to reset the column + # names to the features that we initially used for training. + p_df.columns = [*FEATURE_COLS] + + # Perform prediction. this returns an array object + pred_array = p_predictor.predict(p_df) + # Convert to series + df_predicted = pd.Series(pred_array) + return df_predicted + + # The list of packages that will be used by UDF + udf_packages = p_dbt.config.get('packages') + + predict_position_udf = p_session.udf.register( + predict_position + ,name=f'predict_position' + ,packages = udf_packages + ) + return predict_position_udf + + def download_models_and_libs_from_stage(p_session): + p_session.file.get(f'@{DB_STAGE}/{model_file_path}/{model_file_packaged}', DOWNLOAD_DIR) + + def load_model(p_session): + # Load the model and initialize the predictor + model_fl_path = os.path.join(DOWNLOAD_DIR, model_file_packaged) + predictor = joblib.load(model_fl_path) + return predictor + + # ------------------------------- + def model(dbt, session): + dbt.config( + packages = ['snowflake-snowpark-python' ,'scipy','scikit-learn' ,'pandas' ,'numpy'], + materialized = "table", + tags = "predict" + ) + session._use_scoped_temp_objects = False + download_models_and_libs_from_stage(session) + predictor = load_model(session) + predict_position_udf = register_udf_for_prediction(predictor, session ,dbt) + + # Retrieve the data, and perform the prediction + hold_out_df = (dbt.ref("hold_out_dataset_for_prediction") + .select(*FEATURE_COLS) + ) + + # Perform prediction. + new_predictions_df = hold_out_df.withColumn("position_predicted" + ,predict_position_udf(*FEATURE_COLS) + ) + + return new_predictions_df + ``` +2. Execute the following in the command bar: + ```bash + dbt run --select predict_position + ``` +3. **Commit and push** our changes to keep saving our work as we go using the commit message `logistic regression model training and application` before moving on. +4. 
At a high level in this script, we are: + - Retrieving our staged logistic regression model + - Loading the model in + - Placing the model within a user-defined function (UDF) to call in line predictions on our driver’s position +5. At a more detailed level: + - Import our libraries. + - Create variables to reference back to the `MODELSTAGE` we just created and stored our model to. + - The temporary file paths we created might look intimidating, but all we’re doing here is programmatically using an initial file path and adding to it to create the following directories: + - LOCAL_TEMP_DIR ➡️ /tmp/driver_position + - DOWNLOAD_DIR ➡️ /tmp/driver_position/download + - TARGET_MODEL_DIR_PATH ➡️ /tmp/driver_position/ml_model + - TARGET_LIB_PATH ➡️ /tmp/driver_position/lib + - Provide a list of our feature columns that we used for model training and will now be used on new data for prediction. + - Next, we are creating our main function `register_udf_for_prediction(p_predictor ,p_session ,p_dbt):`. This function is used to register a user-defined function (UDF) that performs the machine learning prediction. It takes three parameters: `p_predictor` is an instance of the machine learning model, `p_session` is an instance of the Snowflake session, and `p_dbt` is an instance of the dbt library. The function creates a UDF named `predict_position` which takes a pandas dataframe with the input features and returns a pandas series with the predictions. + - ⚠️ Pay close attention to the whitespace here. We are using a function within a function for this script. + - We have 2 simple functions that are programmatically retrieving our file paths to first get our stored model out of our `MODELSTAGE` and downloaded into the session `download_models_and_libs_from_stage` and then to load the contents of our model in (parameters) in `load_model` to use for prediction. + - Take the model we loaded in and call it `predictor` and wrap it in a UDF. 
+ - Return our dataframe with both the features used to predict and the new label. + +🧠 Another way to read this script is from the bottom up. This can help us progressively see what is going into our final dbt model and work backwards to see how the other functions are being referenced. + +6. Let’s take a look at our predicted position alongside our feature variables. Open a new scratchpad and use the following query. I chose to order by the prediction of who would obtain a podium position: + ```sql + select * from {{ ref('predict_position') }} order by position_predicted + ``` +7. We can see that we created predictions in our final dataset, we are ready to move on to testing! diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md new file mode 100644 index 00000000000..bcda9a775fb --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/13-testing.md @@ -0,0 +1,136 @@ +--- +title: "Testing" +id: "13-testing" +description: "Testing" +--- +We have now completed building all the models for today’s lab, but how do we know if they meet our assertions? Put another way, how do we know the quality of our data models are any good? This brings us to testing! + +We test data models for mainly two reasons: + +- Ensure that our source data is clean on ingestion before we start data modeling/transformation (aka avoid garbage in, garbage out problem). +- Make sure we don’t introduce bugs in the transformation code we wrote (stop ourselves from creating bad joins/fanouts). + +Testing in dbt comes in two flavors: [generic](/docs/build/tests#generic-tests) and [singular](/docs/build/tests#singular-tests). + +You define them in a test block (similar to a macro) and once defined, you can reference them by name in your `.yml` files (applying them to models, columns, sources, snapshots, and seeds). 
+ +You might be wondering: *what about testing Python models?* + +Since the output of our Python models are tables, we can test SQL and Python models the same way! We don’t have to worry about any syntax differences when testing SQL versus Python data models. This means we use `.yml` and `.sql` files to test our entities (tables, views, etc.). Under the hood, dbt is running an SQL query on our tables to see if they meet assertions. If no rows are returned, dbt will surface a passed test. Conversely, if a test results in returned rows, it will fail or warn depending on the configuration (more on that later). + +## Generic tests + +1. To implement generic out-of-the-box tests dbt comes with, we can use YAML files to specify information about our models. To add generic tests to our aggregates model, create a file called `aggregates.yml`, copy the code block below into the file, and save. + + + ```yaml + version: 2 + + models: + - name: fastest_pit_stops_by_constructor + description: Use the python .describe() method to retrieve summary statistics table about pit stops by constructor. Sort by average stop time ascending so the first row returns the fastest constructor. + columns: + - name: constructor_name + description: team that makes the car + tests: + - unique + + - name: lap_times_moving_avg + description: Use the python .rolling() method to calculate the 5 year rolling average of pit stop times alongside the average for each year. + columns: + - name: race_year + description: year of the race + tests: + - relationships: + to: ref('int_lap_times_years') + field: race_year + ``` + +2. Let’s unpack the code we have here. We have both our aggregates models with the model name to know the object we are referencing and the description of the model that we’ll populate in our documentation. At the column level (a level below our model), we are providing the column name followed by our tests. 
We want to ensure our `constructor_name` is unique since we used a pandas `groupby` on `constructor_name` in the model `fastest_pit_stops_by_constructor`. Next, we want to ensure our `race_year` has referential integrity from the model we selected from `int_lap_times_years` into our subsequent `lap_times_moving_avg` model. +3. Finally, if we want to see how tests were deployed on sources and SQL models, we can look at other files in our project such as the `f1_sources.yml` we created in our Sources and staging section. + +## Using macros for testing + +1. Under your `macros` folder, create a new file and name it `test_all_values_gte_zero.sql`. Copy the code block below and save the file. For clarity, “gte” is an abbreviation for greater than or equal to. + + + ```sql + {% macro test_all_values_gte_zero(table, column) %} + + select * from {{ ref(table) }} where {{ column }} < 0 + + {% endmacro %} + ``` + +2. Macros in Jinja are pieces of code that can be reused multiple times in our SQL models — they are analogous to "functions" in other programming languages, and are extremely useful if you find yourself repeating code across multiple models. +3. We use the `{% macro %}` to indicate the start of the macro and `{% endmacro %}` for the end. The text after the beginning of the macro block is the name we are giving the macro to later call it. In this case, our macro is called `test_all_values_gte_zero`. Macros take in *arguments* to pass through, in this case the `table` and the `column`. In the body of the macro, we see an SQL statement that is using the `ref` function to dynamically select the table and then the column. You can always view macros without having to run them by using `dbt run-operation`. You can learn more [here](https://docs.getdbt.com/reference/commands/run-operation). +4. Great, now we want to reference this macro as a test! Let’s create a new test file called `macro_pit_stops_mean_is_positive.sql` in our `tests` folder. + + + +5. 
Copy the following code into the file and save: + + ```sql + {{ + config( + enabled=true, + severity='warn', + tags = ['bi'] + ) + }} + + {{ test_all_values_gte_zero('fastest_pit_stops_by_constructor', 'mean') }} + ``` + +6. In our testing file, we are applying some configurations to the test including `enabled`, which is an optional configuration for disabling models, seeds, snapshots, and tests. Our severity is set to `warn` instead of `error`, which means our pipeline will still continue to run. We have tagged our test with `bi` since we are applying this test to one of our bi models. + +Then, in our final line, we are calling the `test_all_values_gte_zero` macro that takes in our table and column arguments and inputting our table `'fastest_pit_stops_by_constructor'` and the column `'mean'`. + +## Custom singular tests to validate Python models + +The simplest way to define a test is by writing the exact SQL that will return failing records. We call these "singular" tests, because they're one-off assertions usable for a single purpose. + +These tests are defined in `.sql` files, typically in your `tests` directory (as defined by your test-paths config). You can use Jinja in SQL models (including ref and source) in the test definition, just like you can when creating models. Each `.sql` file contains one select statement, and it defines one test. + +Let’s add a custom test that asserts that the moving average of the lap time over the last 5 years is greater than zero (it’s impossible to have time less than 0!). It is easy to assume if this is not the case the data has been corrupted. + +1. Create a file `lap_times_moving_avg_assert_positive_or_null.sql` under the `tests` folder. + + +2. 
Copy the following code and save the file: + + ```sql + {{ + config( + enabled=true, + severity='error', + tags = ['bi'] + ) + }} + + with lap_times_moving_avg as ( select * from {{ ref('lap_times_moving_avg') }} ) + + select * + from lap_times_moving_avg + where lap_moving_avg_5_years < 0 and lap_moving_avg_5_years is not null + ``` + +## Putting all our tests together + +1. Time to run our tests! Altogether, we have created 4 tests for our 2 Python models: + - `fastest_pit_stops_by_constructor` + - Unique `constructor_name` + - Mean pit stop times are greater than or equal to 0 (no negative time values) + - `lap_times_moving_avg` + - Referential test on `race_year` + - Lap times are greater than 0 or null (to allow for the first leading values in a rolling calculation) +2. To run the tests on both our models, we can use this syntax in the command line to run them both at once, similar to how we did our data splits earlier. + Execute the following in the command bar: + ```bash + dbt test --select fastest_pit_stops_by_constructor lap_times_moving_avg + ``` + + +3. All 4 of our tests passed (yay for clean data)! To understand the SQL being run against each of our tables, we can click into the details of the test. +4. Navigating into the **Details** of the `unique_fastest_pit_stops_by_constructor_name`, we can see that each `constructor_name` should only have one row. 
+ \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md new file mode 100644 index 00000000000..95ec8ad242f --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation.md @@ -0,0 +1,29 @@ +--- +title: "Documentation" +id: "14-documentation" +description: "Documentation" +--- +When it comes to documentation, dbt brings together both column and model level descriptions that you can provide as well as details from your Snowflake information schema in a static site for consumption by other data team members and stakeholders. + +We are going to revisit 2 areas of our project to understand our documentation: + +- `intermediate.md` file +- `dbt_project.yml` file + +To start, let’s look back at our `intermediate.md` file. We can see that we provided multi-line descriptions for the models in our intermediate models using [docs blocks](/docs/collaborate/documentation#using-docs-blocks). Then we reference these docs blocks in our `.yml` file. Building descriptions with doc blocks in Markdown files gives you the ability to format your descriptions with Markdown and are particularly helpful when building long descriptions, either at the column or model level. In our `dbt_project.yml`, we added `node_colors` at folder levels. + +1. To see all these pieces come together, execute this in the command bar: + ```bash + dbt docs generate + ``` + This will generate the documentation for your project. Click the book button, as shown in the screenshot below to access the docs. + + +2. Go to our project area and view `int_results`. View the description that we created in our doc block. + + +3. View the mini-lineage that looks at the model we are currently selected on (`int_results` in this case). + + +4. In our `dbt_project.yml`, we configured `node_colors` depending on the file directory. 
Starting in dbt v1.3, we can see how our lineage in our docs looks. By color coding your project, it can help you cluster together similar models or steps and more easily troubleshoot. + \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md new file mode 100644 index 00000000000..d9cedb60861 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment.md @@ -0,0 +1,50 @@ +--- +title: "Deployment" +id: "15-deployment" +description: "Deployment" +--- + +Before we jump into deploying our code, let's have a quick primer on environments. Up to this point, all of the work we've done in the dbt Cloud IDE has been in our development environment, with code committed to a feature branch and the models we've built created in our development schema in Snowflake as defined in our Development environment connection. Doing this work on a feature branch, allows us to separate our code from what other coworkers are building and code that is already deemed production ready. Building models in a development schema in Snowflake allows us to separate the database objects we might still be modifying and testing from the database objects running production dashboards or other downstream dependencies. Together, the combination of a Git branch and Snowflake database objects form our environment. + +Now that we've completed testing and documenting our work, we're ready to deploy our code from our development environment to our production environment and this involves two steps: + +- Promoting code from our feature branch to the production branch in our repository. + - Generally, the production branch is going to be named your main branch and there's a review process to go through before merging code to the main branch of a repository. Here we are going to merge without review for ease of this workshop. 
+- Deploying code to our production environment. + - Once our code is merged to the main branch, we'll need to run dbt in our production environment to build all of our models and run all of our tests. This will allow us to build production-ready objects into our production environment in Snowflake. Luckily for us, the Partner Connect flow has already created our deployment environment and job to facilitate this step. + +1. Before getting started, let's make sure that we've committed all of our work to our feature branch. If you still have work to commit, you'll be able to select the **Commit and push**, provide a message, and then select **Commit** again. +2. Once all of your work is committed, the git workflow button will now appear as **Merge to main**. Select **Merge to main** and the merge process will automatically run in the background. + + +3. When it's completed, you should see the git button read **Create branch** and the branch you're currently looking at will become **main**. +4. Now that all of our development work has been merged to the main branch, we can build our deployment job. Given that our production environment and production job were created automatically for us through Partner Connect, all we need to do here is update some default configurations to meet our needs. +5. In the menu, select **Deploy** **> Environments** + + +6. You should see two environments listed and you'll want to select the **Deployment** environment then **Settings** to modify it. +7. Before making any changes, let's touch on what is defined within this environment. The Snowflake connection shows the credentials that dbt Cloud is using for this environment and in our case they are the same as what was created for us through Partner Connect. Our deployment job will build in our `PC_DBT_DB` database and use the default Partner Connect role and warehouse to do so. 
The deployment credentials section also uses the info that was created in our Partner Connect job to create the credential connection. However, it is using the same default schema that we've been using as the schema for our development environment. +8. Let's update the schema to create a new schema specifically for our production environment. Click **Edit** to allow you to modify the existing field values. Navigate to **Deployment Credentials >** **schema.** +9. Update the schema name to **production**. Remember to select **Save** after you've made the change. + +10. By updating the schema for our production environment to **production**, it ensures that our deployment job for this environment will build our dbt models in the **production** schema within the `PC_DBT_DB` database as defined in the Snowflake Connection section. +11. Now let's switch over to our production job. Click on the deploy tab again and then select **Jobs**. You should see an existing and preconfigured **Partner Connect Trial Job**. Similar to the environment, click on the job, then select **Settings** to modify it. Let's take a look at the job to understand it before making changes. + + - The Environment section is what connects this job with the environment we want it to run in. This job is already defaulted to use the Deployment environment that we just updated and the rest of the settings we can keep as is. + - The Execution settings section gives us the option to generate docs, run source freshness, and defer to a previous run state. For the purposes of our lab, we're going to keep these settings as is as well and stick with just generating docs. + - The Commands section is where we specify exactly which commands we want to run during this job, and we also want to keep this as is. We want our seed to be uploaded first, then run our models, and finally test them. 
The order of this is important as well, considering that we need our seed to be created before we can run our incremental model, and we need our models to be created before we can test them. + - Finally, we have the Triggers section, where we have a number of different options for scheduling our job. Given that our data isn't updating regularly here and we're running this job manually for now, we're also going to leave this section alone. + + So, what are we changing then? Just the name! Click **Edit** to allow you to make changes. Then update the name of the job to **Production Job** to denote this as our production deployment job. After that's done, click **Save**. +12. Now let's go to run our job. Clicking on the job name in the path at the top of the screen will take you back to the job run history page where you'll be able to click **Run now** to kick off the job. If you encounter any job failures, try running the job again before further troubleshooting. + + + +13. Let's go over to Snowflake to confirm that everything built as expected in our production schema. Refresh the database objects in your Snowflake account and you should see the production schema now within our default Partner Connect database. If you click into the schema and everything ran successfully, you should be able to see all of the models we developed. + + +## Conclusion + +Fantastic! You’ve finished the workshop! We hope you feel empowered in using both SQL and Python in your dbt Cloud workflows with Snowflake. Having a reliable pipeline to surface both analytics and machine learning is crucial to creating tangible business value from your data. + +For more help and information join our [dbt community Slack](https://www.getdbt.com/community/) which contains more than 50,000 data practitioners today. We have a dedicated Slack channel, #db-snowflake, for Snowflake-related content. Happy dbt'ing! 
\ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md new file mode 100644 index 00000000000..e864c363a44 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration.md @@ -0,0 +1,27 @@ +--- +title: "Configure Snowflake" +id: "2-snowflake-configuration" +description: "Configure Snowflake" +--- + + +1. Log in to your trial Snowflake account. You can [sign up for a Snowflake Trial Account using this form](https://signup.snowflake.com/) if you don’t have one. +2. Ensure that your account is set up using **AWS** in the **US East (N. Virginia)**. We will be copying the data from a public AWS S3 bucket hosted by dbt Labs in the us-east-1 region. By ensuring our Snowflake environment setup matches our bucket region, we avoid any multi-region data copy and retrieval latency issues. + + + +3. After creating your account and verifying it from your sign-up email, Snowflake will direct you back to the UI called Snowsight. + +4. When Snowsight first opens, your window should look like the following, with you logged in as the ACCOUNTADMIN with demo worksheets open: + + + + +5. Navigate to **Admin > Billing & Terms**. Click **Enable > Acknowledge & Continue** to enable Anaconda Python Packages to run in Snowflake. + + + + + +6. Finally, create a new Worksheet by selecting **+ Worksheet** in the upper right corner. 
+ diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md new file mode 100644 index 00000000000..9a41e7f45c5 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source.md @@ -0,0 +1,192 @@ +--- +title: "Connect to data source" +id: "3-connect-to-data-source" +description: "Connect to data source" +--- + +We need to obtain our data source by copying our Formula 1 data into Snowflake tables from a public S3 bucket that dbt Labs hosts. + +1. When a new Snowflake account is created, there should be a preconfigured warehouse in your account named `COMPUTE_WH`. +2. If for any reason your account doesn’t have this warehouse, we can create a warehouse using the following script: + + ```sql + create or replace warehouse COMPUTE_WH with warehouse_size=XSMALL + ``` +3. Rename the worksheet to `data setup script` since we will be placing code in this worksheet to ingest the Formula 1 data. Make sure you are still logged in as the **ACCOUNTADMIN** and select the **COMPUTE_WH** warehouse. + + + +4. Copy the following code into the main body of the Snowflake worksheet. You can also find this setup script under the `setup` folder in the [Git repository](https://github.com/dbt-labs/python-snowpark-formula1/blob/main/setup/setup_script_s3_to_snowflake.sql). The script is long since it's bringing in all of the data we'll need today! 
+ + ```sql + -- create and define our formula1 database + create or replace database formula1; + use database formula1; + create or replace schema raw; + use schema raw; + + -- define our file format for reading in the csvs + create or replace file format csvformat + type = csv + field_delimiter =',' + field_optionally_enclosed_by = '"', + skip_header=1; + + -- + create or replace stage formula1_stage + file_format = csvformat + url = 's3://formula1-dbt-cloud-python-demo/formula1-kaggle-data/'; + + -- load in the 8 tables we need for our demo + -- we are first creating the table then copying our data in from s3 + -- think of this as an empty container or shell that we are then filling + create or replace table formula1.raw.circuits ( + CIRCUITID NUMBER(38,0), + CIRCUITREF VARCHAR(16777216), + NAME VARCHAR(16777216), + LOCATION VARCHAR(16777216), + COUNTRY VARCHAR(16777216), + LAT FLOAT, + LNG FLOAT, + ALT NUMBER(38,0), + URL VARCHAR(16777216) + ); + -- copy our data from public s3 bucket into our tables + copy into circuits + from @formula1_stage/circuits.csv + on_error='continue'; + + create or replace table formula1.raw.constructors ( + CONSTRUCTORID NUMBER(38,0), + CONSTRUCTORREF VARCHAR(16777216), + NAME VARCHAR(16777216), + NATIONALITY VARCHAR(16777216), + URL VARCHAR(16777216) + ); + copy into constructors + from @formula1_stage/constructors.csv + on_error='continue'; + + create or replace table formula1.raw.drivers ( + DRIVERID NUMBER(38,0), + DRIVERREF VARCHAR(16777216), + NUMBER VARCHAR(16777216), + CODE VARCHAR(16777216), + FORENAME VARCHAR(16777216), + SURNAME VARCHAR(16777216), + DOB DATE, + NATIONALITY VARCHAR(16777216), + URL VARCHAR(16777216) + ); + copy into drivers + from @formula1_stage/drivers.csv + on_error='continue'; + + create or replace table formula1.raw.lap_times ( + RACEID NUMBER(38,0), + DRIVERID NUMBER(38,0), + LAP NUMBER(38,0), + POSITION FLOAT, + TIME VARCHAR(16777216), + MILLISECONDS NUMBER(38,0) + ); + copy into lap_times + from 
@formula1_stage/lap_times.csv + on_error='continue'; + + create or replace table formula1.raw.pit_stops ( + RACEID NUMBER(38,0), + DRIVERID NUMBER(38,0), + STOP NUMBER(38,0), + LAP NUMBER(38,0), + TIME VARCHAR(16777216), + DURATION VARCHAR(16777216), + MILLISECONDS NUMBER(38,0) + ); + copy into pit_stops + from @formula1_stage/pit_stops.csv + on_error='continue'; + + create or replace table formula1.raw.races ( + RACEID NUMBER(38,0), + YEAR NUMBER(38,0), + ROUND NUMBER(38,0), + CIRCUITID NUMBER(38,0), + NAME VARCHAR(16777216), + DATE DATE, + TIME VARCHAR(16777216), + URL VARCHAR(16777216), + FP1_DATE VARCHAR(16777216), + FP1_TIME VARCHAR(16777216), + FP2_DATE VARCHAR(16777216), + FP2_TIME VARCHAR(16777216), + FP3_DATE VARCHAR(16777216), + FP3_TIME VARCHAR(16777216), + QUALI_DATE VARCHAR(16777216), + QUALI_TIME VARCHAR(16777216), + SPRINT_DATE VARCHAR(16777216), + SPRINT_TIME VARCHAR(16777216) + ); + copy into races + from @formula1_stage/races.csv + on_error='continue'; + + create or replace table formula1.raw.results ( + RESULTID NUMBER(38,0), + RACEID NUMBER(38,0), + DRIVERID NUMBER(38,0), + CONSTRUCTORID NUMBER(38,0), + NUMBER NUMBER(38,0), + GRID NUMBER(38,0), + POSITION FLOAT, + POSITIONTEXT VARCHAR(16777216), + POSITIONORDER NUMBER(38,0), + POINTS NUMBER(38,0), + LAPS NUMBER(38,0), + TIME VARCHAR(16777216), + MILLISECONDS NUMBER(38,0), + FASTESTLAP NUMBER(38,0), + RANK NUMBER(38,0), + FASTESTLAPTIME VARCHAR(16777216), + FASTESTLAPSPEED FLOAT, + STATUSID NUMBER(38,0) + ); + copy into results + from @formula1_stage/results.csv + on_error='continue'; + + create or replace table formula1.raw.status ( + STATUSID NUMBER(38,0), + STATUS VARCHAR(16777216) + ); + copy into status + from @formula1_stage/status.csv + on_error='continue'; + + ``` +5. Ensure all the commands are selected before running the query — an easy way to do this is to use Ctrl-a to highlight all of the code in the worksheet. Select **run** (blue triangle icon). 
Notice how the dot next to your **COMPUTE_WH** turns from gray to green as you run the query. The **status** table is the final table of all 8 tables loaded in. + + + +6. Let’s unpack that pretty long query we ran into component parts. We ran this query to load in our 8 Formula 1 tables from a public S3 bucket. To do this, we: + - Created a new database called `formula1` and a schema called `raw` to place our raw (untransformed) data into. + - Defined our file format for our CSV files. Importantly, here we use a parameter called `field_optionally_enclosed_by =` since the string columns in our Formula 1 CSV files use quotes. Quotes are used around string values to avoid parsing issues where commas `,` and new lines `\n` in data values could cause data loading errors. + - Created a stage to locate our data we are going to load in. Snowflake Stages are locations where data files are stored. Stages are used to both load and unload data to and from Snowflake locations. Here we are using an external stage, by referencing an S3 bucket. + - Created our tables for our data to be copied into. These are empty tables with the column name and data type. Think of this as creating an empty container that the data will then fill. + - Used the `copy into` statement for each of our tables. We reference our staged location we created and upon loading errors continue to load in the rest of the data. You should not have data loading errors but if you do, those rows will be skipped and Snowflake will tell you which rows caused errors. + +7. Now let's take a look at some of our cool Formula 1 data we just loaded up! + 1. Create a new worksheet by selecting the **+** then **New Worksheet**. + + 2. Navigate to **Database > Formula1 > RAW > Tables**. + 3. Query the data using the following code. There are only 76 rows in the circuits table, so we don’t need to worry about limiting the amount of data we query. + ```sql + select * from formula1.raw.circuits + ``` + 4. Run the query.
From here on out, we’ll use the keyboard shortcuts Command-Enter or Control-Enter to run queries and won’t explicitly call out this step. + 5. Review the query results, you should see information about Formula 1 circuits, starting with Albert Park in Australia! + 6. Finally, ensure you have all 8 tables starting with `CIRCUITS` and ending with `STATUS`. Now we are ready to connect into dbt Cloud! + + + + \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md new file mode 100644 index 00000000000..21eaa7e8d7f --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt.md @@ -0,0 +1,27 @@ +--- +title: "Configure dbt" +id: "4-configure-dbt" +description: "Configure dbt" +--- + +1. We are going to be using [Snowflake Partner Connect](https://docs.snowflake.com/en/user-guide/ecosystem-partner-connect.html) to set up a dbt Cloud account. Using this method will allow you to spin up a fully fledged dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [managed repository](/docs/collaborate/git/managed-repository), environments, and credentials already established. +2. Navigate out of your worksheet back by selecting **home**. +3. In Snowsight, confirm that you are using the **ACCOUNTADMIN** role. +4. Navigate to the **Admin** **> Partner Connect**. Find **dbt** either by using the search bar or navigating the **Data Integration**. Select the **dbt** tile. + +5. You should now see a new window that says **Connect to dbt**. Select **Optional Grant** and add the `FORMULA1` database. This will grant access for your new dbt user role to the FORMULA1 database. + + +6. Ensure the `FORMULA1` is present in your optional grant before clicking **Connect**.  This will create a dedicated dbt user, database, warehouse, and role for your dbt Cloud trial. + + + +7. 
When you see the **Your partner account has been created** window, click **Activate**. + +8. You should be redirected to a dbt Cloud registration page. Fill out the form. Make sure to save the password somewhere for login in the future. + + + +9. Select **Complete Registration**. You should now be redirected to your dbt Cloud account, complete with a connection to your Snowflake account, a deployment and a development environment, and a sample job. + +10. To help you version control your dbt project, we have connected it to a [managed repository](/docs/collaborate/git/managed-repository), which means that dbt Labs will be hosting your repository for you. This will give you access to a Git workflow without you having to create and host the repository yourself. You will not need to know Git for this workshop; dbt Cloud will help guide you through the workflow. In the future, when you’re developing your own project, [feel free to use your own repository](/docs/cloud/git/connect-github). This will allow you to learn more about features like [Slim CI](/docs/deploy/continuous-integration) builds after this workshop. diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md new file mode 100644 index 00000000000..f098c47bdad --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name.md @@ -0,0 +1,46 @@ +--- +title: "Development schema name and IDE walkthrough" +id: "5-development-schema-name" +description: "Development schema name and IDE walkthrough" +--- + +1. First we are going to change the name of the default schema where our dbt models will build. By default, the name is `dbt_`. We will change this to `dbt_YOUR_NAME` to create your own personal development schema. To do this, select **Profile Settings** from the gear icon in the upper right. + + + +2.
Navigate to the **Credentials** menu and select **Partner Connect Trial**, which will expand the credentials menu. + + + +3. Click **Edit** and change the name of your schema from `dbt_` to `dbt_YOUR_NAME` replacing `YOUR_NAME` with your initials and name (`hwatson` is used in the lab screenshots). Be sure to click **Save** for your changes! + + +4. We now have our own personal development schema, amazing! When we run our first dbt models they will build into this schema. +5. Let’s open up dbt Cloud’s Integrated Development Environment (IDE) and familiarize ourselves. Choose **Develop** at the top of the UI. + +6. When the IDE is done loading, click **Initialize dbt project**. The initialization process creates a collection of files and folders necessary to run your dbt project. + + +7. After the initialization is finished, you can view the files and folders in the file tree menu. As we move through the workshop we'll be sure to touch on a few key files and folders that we'll work with to build out our project. +8. Next click **Commit and push** to commit the new files and folders from the initialize step. We always want our commit messages to be relevant to the work we're committing, so be sure to provide a message like `initialize project` and select **Commit Changes**. + + + + + +9. [Committing](https://www.atlassian.com/git/tutorials/saving-changes/git-commit) your work here will save it to the managed git repository that was created during the Partner Connect signup. This initial commit is the only commit that will be made directly to our `main` branch and from *here on out we'll be doing all of our work on a development branch*. This allows us to keep our development work separate from our production code. +10. There are a couple of key features to point out about the IDE before we get to work. It is a text editor, an SQL and Python runner, and a CLI with Git version control all baked into one package! 
This allows you to focus on editing your SQL and Python files, previewing the results with the SQL runner (it even runs Jinja!), and building models at the command line without having to move between different applications. The Git workflow in dbt Cloud allows Git beginners and experts alike to easily version control all of their work with a couple of clicks. + + + +11. Let's run our first dbt models! Two example models are included in your dbt project in the `models/example` folder that we can use to illustrate how to run dbt at the command line. Type `dbt run` into the command line and press **Enter** on your keyboard. When the run bar expands you'll be able to see the results of the run, where you should see the run complete successfully. + + + +12. The run results allow you to see the code that dbt compiles and sends to Snowflake for execution. To view the logs for this run, select one of the model tabs using the **>** icon and then **Details**. If you scroll down a bit you'll be able to see the compiled code and how dbt interacts with Snowflake. Given that this run took place in our development environment, the models were created in your development schema. + + + + +13. Now let's switch over to Snowflake to confirm that the objects were actually created. Click on the three dots **…** above your database objects and then **Refresh**. Expand the **PC_DBT_DB** database and you should see your development schema. Select the schema, then **Tables** and **Views**. Now you should be able to see `MY_FIRST_DBT_MODEL` as a table and `MY_SECOND_DBT_MODEL` as a view.
+ \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md new file mode 100644 index 00000000000..e387b208dd1 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/6-foundational-structure.md @@ -0,0 +1,80 @@ +--- +title: "Foundational structure" +id: "6-foundational-structure" +description: "Foundational structure" +--- + +In this step, we’ll need to create a development branch and set up project level configurations. + +1. To get started with development for our project, we'll need to create a new Git branch for our work. Select **create branch** and name your development branch. We'll call our branch `snowpark_python_workshop` then click **Submit**. +2. The first piece of development we'll do on the project is to update the `dbt_project.yml` file. Every dbt project requires a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. The [dbt_project.yml](/reference/dbt_project.yml) file also contains important information that tells dbt how to operate on your project. +3. Select the `dbt_project.yml` file from the file tree to open it and replace all of the existing contents with the following code below. When you're done, save the file by clicking **save**. You can also use the Command-S or Control-S shortcut from here on out. + + ```yaml + # Name your project! Project names should contain only lowercase characters + # and underscores. A good package name should reflect your organization's + # name or the intended use of these models + name: 'snowflake_dbt_python_formula1' + version: '1.3.0' + require-dbt-version: '>=1.3.0' + config-version: 2 + + # This setting configures which "profile" dbt uses for this project. + profile: 'default' + + # These configurations specify where dbt should look for different types of files. 
+ # The `model-paths` config, for example, states that models in this project can be + # found in the "models/" directory. You probably won't need to change these! + model-paths: ["models"] + analysis-paths: ["analyses"] + test-paths: ["tests"] + seed-paths: ["seeds"] + macro-paths: ["macros"] + snapshot-paths: ["snapshots"] + + target-path: "target" # directory which will store compiled SQL files + clean-targets: # directories to be removed by `dbt clean` + - "target" + - "dbt_packages" + + models: + snowflake_dbt_python_formula1: + staging: + + +docs: + node_color: "CadetBlue" + marts: + +materialized: table + aggregates: + +docs: + node_color: "Maroon" + +tags: "bi" + + core: + +docs: + node_color: "#800080" + intermediate: + +docs: + node_color: "MediumSlateBlue" + ml: + prep: + +docs: + node_color: "Indigo" + train_predict: + +docs: + node_color: "#36454f" + + ``` + +4. The key configurations to point out in the file with relation to the work that we're going to do are in the `models` section. + - `require-dbt-version` — Tells dbt which version of dbt to use for your project. We are requiring 1.3.0 and any newer version to run python models and node colors. + - `materialized` — Tells dbt how to materialize models when compiling the code before it pushes it down to Snowflake. All models in the `marts` folder will be built as tables. + - `tags` — Applies tags at a directory level to all models. All models in the `aggregates` folder will be tagged as `bi` (abbreviation for business intelligence). + - `docs` — Specifies the `node_color` either by the plain color name or a hex value. +5. [Materializations](/docs/build/materializations) are strategies for persisting dbt models in a warehouse, with `tables` and `views` being the most commonly utilized types. By default, all dbt models are materialized as views and other materialization types can be configured in the `dbt_project.yml` file or in a model itself. 
It’s very important to note *Python models can only be materialized as tables or incremental models.* Since all our Python models exist under `marts`, the following portion of our `dbt_project.yml` ensures no errors will occur when we run our Python models. Starting with [dbt version 1.4](/guides/migration/versions/upgrading-to-v1.4#updates-to-python-models), Python files will automatically get materialized as tables even if not explicitly specified. + + ```yaml + marts:     + +materialized: table + ``` + diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md new file mode 100644 index 00000000000..a47a3b54d48 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure.md @@ -0,0 +1,27 @@ +--- +title: "Folder structure" +id: "7-folder-structure" +description: "Folder structure" +--- +dbt Labs has developed a [project structure guide](/guides/best-practices/how-we-structure/1-guide-overview/) that contains a number of recommendations for how to build the folder structure for your project. Do check out that guide if you want to learn more. Right now we are going to create some folders to organize our files: + +- Sources — This is our Formula 1 dataset and it will be defined in a source YAML file. +- Staging models — These models have a 1:1 with their source table. +- Intermediate — This is where we will be joining some Formula staging models. +- Marts models — Here is where we perform our major transformations. It contains these subfolders: + - aggregates + - core + - ml +1. In your file tree, use your cursor and hover over the `models` subdirectory, click the three dots **…** that appear to the right of the folder name, then select **Create Folder**. We're going to add two new folders to the file path, `staging` and `formula1` (in that order) by typing `staging/formula1` into the file path. 
+ + + + + - If you click into your `models` directory now, you should see the new `staging` folder nested within `models` and the `formula1` folder nested within `staging`. +2. Create two additional folders the same way as in the last step. Within the `models` subdirectory, create new directories `marts/core`. + +3. We will need to create a few more folders and subfolders using the UI. After you create all the necessary folders, your folder tree should look like this when it's all done: + + + +Remember you can always reference the entire project in [GitHub](https://github.com/dbt-labs/python-snowpark-formula1/tree/python-formula1) to view the complete folder and file structure. \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md new file mode 100644 index 00000000000..22e49c8a30b --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging.md @@ -0,0 +1,334 @@ +--- +title: "Sources and staging" +id: "8-sources-and-staging" +description: "Sources and staging" +--- + +In this section, we are going to create our source and staging models. + +Sources allow us to create a dependency between our source database object and our staging models, which will help us when we look at data lineage later. Also, if your source changes database or schema, you only have to update it in your `f1_sources.yml` file rather than updating all of the models it might be used in. + +Staging models are the base of our project, where we bring all the individual components we're going to use to build our more complex and useful models into the project.
+ +Since we want to focus on dbt and Python in this workshop, check out our [sources](/docs/build/sources) and [staging](/guides/best-practices/how-we-structure/2-staging) docs if you want to learn more (or take our [dbt Fundamentals](https://courses.getdbt.com/collections) course which covers all of our core functionality). + +## Create sources + +We're going to be using each of our 8 Formula 1 tables from our `formula1` database under the `raw`  schema for our transformations and we want to create those tables as sources in our project. + +1. Create a new file called `f1_sources.yml` with the following file path: `models/staging/formula1/f1_sources.yml`. +2. Then, paste the following code into the file before saving it: + +```yaml +version: 2 + +sources: + - name: formula1 + description: formula 1 datasets with normalized tables + database: formula1 + schema: raw + tables: + - name: circuits + description: One record per circuit, which is the specific race course. + columns: + - name: circuitid + tests: + - unique + - not_null + - name: constructors + description: One record per constructor. Constructors are the teams that build their formula 1 cars. + columns: + - name: constructorid + tests: + - unique + - not_null + - name: drivers + description: One record per driver. This table gives details about the driver. + columns: + - name: driverid + tests: + - unique + - not_null + - name: lap_times + description: One row per lap in each race. Lap times started being recorded in this dataset in 1984 and joined through driver_id. + - name: pit_stops + description: One row per pit stop. Pit stops do not have their own id column, the combination of the race_id and driver_id identify the pit stop. + columns: + - name: stop + tests: + - accepted_values: + values: [1,2,3,4,5,6,7,8] + quote: false + - name: races + description: One race per row. Importantly this table contains the race year to understand trends. 
+ columns: + - name: raceid + tests: + - unique + - not_null + - name: results + columns: + - name: resultid + tests: + - unique + - not_null + description: One row per result. The main table that we join out for grid and position variables. + - name: status + description: One status per row. The status contextualizes whether the race was finished or what issues arose e.g. collisions, engine, etc. + columns: + - name: statusid + tests: + - unique + - not_null +``` + +## Create staging models + +The next step is to set up the staging models for each of the 8 source tables. Given the one-to-one relationship between staging models and their corresponding source tables, we'll build 8 staging models here. We know it’s a lot and in the future, we will seek to update the workshop to make this step less repetitive and more efficient. This step is also a good representation of the real world of data, where you have multiple hierarchical tables that you will need to join together! + +1. Let's go in alphabetical order to easily keep track of all our staging models! Create a new file called `stg_f1_circuits.sql` with this file path `models/staging/formula1/stg_f1_circuits.sql`. Then, paste the following code into the file before saving it: + + ```sql + with + + source as ( + + select * from {{ source('formula1','circuits') }} + + ), + + renamed as ( + select + circuitid as circuit_id, + circuitref as circuit_ref, + name as circuit_name, + location, + country, + lat as latitude, + lng as longitude, + alt as altitude + -- omit the url + from source + ) + select * from renamed + ``` + + All we're doing here is pulling the source data into the model using the `source` function, renaming some columns, and omitting the column `url` with a commented note since we don’t need it for our analysis. + +1. Create `stg_f1_constructors.sql` with this file path `models/staging/formula1/stg_f1_constructors.sql`. 
Paste the following code into it before saving the file: + + ```sql + with + + source as ( + + select * from {{ source('formula1','constructors') }} + + ), + + renamed as ( + select + constructorid as constructor_id, + constructorref as constructor_ref, + name as constructor_name, + nationality as constructor_nationality + -- omit the url + from source + ) + + select * from renamed + ``` + + We have 6 other stages models to create. We can do this by creating new files, then copy and paste the code into our `staging` folder. + +1. Create `stg_f1_drivers.sql` with this file path `models/staging/formula1/stg_f1_drivers.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','drivers') }} + + ), + + renamed as ( + select + driverid as driver_id, + driverref as driver_ref, + number as driver_number, + code as driver_code, + forename, + surname, + dob as date_of_birth, + nationality as driver_nationality + -- omit the url + from source + ) + + select * from renamed + ``` +1. Create `stg_f1_lap_times.sql` with this file path `models/staging/formula1/stg_f1_lap_times.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','lap_times') }} + + ), + + renamed as ( + select + raceid as race_id, + driverid as driver_id, + lap, + position, + time as lap_time_formatted, + milliseconds as lap_time_milliseconds + from source + ) + + select * from renamed + ``` +1. Create `stg_f1_pit_stops.sql` with this file path `models/staging/formula1/stg_f1_pit_stops.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','pit_stops') }} + + ), + + renamed as ( + select + raceid as race_id, + driverid as driver_id, + stop as stop_number, + lap, + time as lap_time_formatted, + duration as pit_stop_duration_seconds, + milliseconds as pit_stop_milliseconds + from source + ) + + select * from renamed + order by pit_stop_duration_seconds desc + ``` + +1. 
Create ` stg_f1_races.sql` with this file path `models/staging/formula1/stg_f1_races.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','races') }} + + ), + + renamed as ( + select + raceid as race_id, + year as race_year, + round as race_round, + circuitid as circuit_id, + name as circuit_name, + date as race_date, + to_time(time) as race_time, + -- omit the url + fp1_date as free_practice_1_date, + fp1_time as free_practice_1_time, + fp2_date as free_practice_2_date, + fp2_time as free_practice_2_time, + fp3_date as free_practice_3_date, + fp3_time as free_practice_3_time, + quali_date as qualifying_date, + quali_time as qualifying_time, + sprint_date, + sprint_time + from source + ) + + select * from renamed + ``` +1. Create `stg_f1_results.sql` with this file path `models/staging/formula1/stg_f1_results.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','results') }} + + ), + + renamed as ( + select + resultid as result_id, + raceid as race_id, + driverid as driver_id, + constructorid as constructor_id, + number as driver_number, + grid, + position::int as position, + positiontext as position_text, + positionorder as position_order, + points, + laps, + time as results_time_formatted, + milliseconds as results_milliseconds, + fastestlap as fastest_lap, + rank as results_rank, + fastestlaptime as fastest_lap_time_formatted, + fastestlapspeed::decimal(6,3) as fastest_lap_speed, + statusid as status_id + from source + ) + + select * from renamed + ``` +1. Last one! Create `stg_f1_status.sql` with this file path: `models/staging/formula1/stg_f1_status.sql`: + + ```sql + with + + source as ( + + select * from {{ source('formula1','status') }} + + ), + + renamed as ( + select + statusid as status_id, + status + from source + ) + + select * from renamed + ``` + After the source and all the staging models are complete for each of the 8 tables, your staging folder should look like this: + + + +1. 
It’s a good time to delete our example folder since these two models are extraneous to our formula1 pipeline and `my_first_dbt_model` fails a `not_null` test that we won’t spend time investigating. dbt Cloud will warn us that this folder will be permanently deleted, and we are okay with that, so select **Delete**. + + + +1. Now that the staging models are built and saved, it's time to create the models in our development schema in Snowflake. To do this we're going to enter into the command line `dbt build` to run all of the models in our project, which now consists of the 8 new staging models since we deleted the example models. + + Your run should complete successfully and you should see green checkmarks next to all of your models in the run results. We built our 8 staging models as views and ran 13 source tests that we configured in the `f1_sources.yml` file with not that much code, pretty cool! + + + + Let's take a quick look in Snowflake, refresh database objects, open our development schema, and confirm that the new models are there. If you can see them, then we're good to go! + + + + Before we move on to the next section, be sure to commit your new models to your Git branch. Click **Commit and push** and give your commit a message like `profile, sources, and staging setup` before moving on. + + \ No newline at end of file diff --git a/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md new file mode 100644 index 00000000000..262bf0e5e52 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations.md @@ -0,0 +1,299 @@ +--- +title: "SQL transformations" +id: "9-sql-transformations" +description: "SQL transformations" +--- + +Now that we have all our sources and staging models done, it's time to move into where dbt shines — transformation!
+ +We need to: + +- Create some intermediate tables to join tables that aren’t hierarchical +- Create core tables for business intelligence (BI) tool ingestion +- Answer the two questions about: + - fastest pit stops + - lap time trends about our Formula 1 data by creating aggregate models using Python! + +## Intermediate models + +We need to join lots of reference tables to our results table to create a human readable dataframe. What does this mean? For example, we don’t only want to have the numeric `status_id` in our table, we want to be able to read in a row of data that a driver could not finish a race due to engine failure (`status_id=5`). + +By now, we are pretty good at creating new files in the correct directories so we won’t cover this in detail. All intermediate models should be created in the path `models/intermediate`. + +1. Create a new file called `int_lap_times_years.sql`. In this model, we are joining our lap time and race information so we can look at lap times over years. In earlier Formula 1 eras, lap times were not recorded (only final results), so we filter out records where lap times are null. + + ```sql + with lap_times as ( + + select * from {{ ref('stg_f1_lap_times') }} + + ), + + races as ( + + select * from {{ ref('stg_f1_races') }} + + ), + + expanded_lap_times_by_year as ( + select + lap_times.race_id, + driver_id, + race_year, + lap, + lap_time_milliseconds + from lap_times + left join races + on lap_times.race_id = races.race_id + where lap_time_milliseconds is not null + ) + + select * from expanded_lap_times_by_year + ``` + +2. Create a file called `int_pit_stops.sql`. Pit stops have a many-to-one (M:1) relationship with our races. We are creating a feature called `total_pit_stops_per_race` by partitioning over our `race_id` and `driver_id`, while preserving individual level pit stops for rolling average in our next section.
+ + ```sql + with stg_f1__pit_stops as + ( + select * from {{ ref('stg_f1_pit_stops') }} + ), + + pit_stops_per_race as ( + select + race_id, + driver_id, + stop_number, + lap, + lap_time_formatted, + pit_stop_duration_seconds, + pit_stop_milliseconds, + max(stop_number) over (partition by race_id,driver_id) as total_pit_stops_per_race + from stg_f1__pit_stops + ) + + select * from pit_stops_per_race + ``` + +3. Create a file called `int_results.sql`. Here we are using 4 of our tables — `races`, `drivers`, `constructors`, and `status` — to give context to our `results` table. We are now able to calculate a new feature `drivers_age_years` by bringing the `date_of_birth` and `race_year` into the same table. We are also creating a column to indicate if the driver did not finish (dnf) the race, based upon if their `position` was null called, `dnf_flag`. + + ```sql + with results as ( + + select * from {{ ref('stg_f1_results') }} + + ), + + races as ( + + select * from {{ ref('stg_f1_races') }} + + ), + + drivers as ( + + select * from {{ ref('stg_f1_drivers') }} + + ), + + constructors as ( + + select * from {{ ref('stg_f1_constructors') }} + ), + + status as ( + + select * from {{ ref('stg_f1_status') }} + ), + + int_results as ( + select + result_id, + results.race_id, + race_year, + race_round, + circuit_id, + circuit_name, + race_date, + race_time, + results.driver_id, + results.driver_number, + forename ||' '|| surname as driver, + cast(datediff('year', date_of_birth, race_date) as int) as drivers_age_years, + driver_nationality, + results.constructor_id, + constructor_name, + constructor_nationality, + grid, + position, + position_text, + position_order, + points, + laps, + results_time_formatted, + results_milliseconds, + fastest_lap, + results_rank, + fastest_lap_time_formatted, + fastest_lap_speed, + results.status_id, + status, + case when position is null then 1 else 0 end as dnf_flag + from results + left join races + on results.race_id=races.race_id + left 
join drivers + on results.driver_id = drivers.driver_id + left join constructors + on results.constructor_id = constructors.constructor_id + left join status + on results.status_id = status.status_id + ) + + select * from int_results + ``` +1. Create a *Markdown* file `intermediate.md` that we will go over in depth during the [Testing](/guides/dbt-ecosystem/dbt-python-snowpark/13-testing) and [Documentation](/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation) sections. + + ```markdown + # the intent of this .md is to allow for multi-line long form explanations for our intermediate transformations + + # below are descriptions + {% docs int_results %} In this query we want to join out other important information about the race results to have a human readable table about results, races, drivers, constructors, and status. + We will have 4 left joins onto our results table. {% enddocs %} + + {% docs int_pit_stops %} There are many pit stops within one race, aka a M:1 relationship. + We want to aggregate this so we can properly join pit stop information without creating a fanout. {% enddocs %} + + {% docs int_lap_times_years %} Lap times are done per lap. We need to join them out to the race year to understand yearly lap time trends. {% enddocs %} + ``` +1. Create a *YAML* file `intermediate.yml` that we will go over in depth during the [Testing](/guides/dbt-ecosystem/dbt-python-snowpark/13-testing) and [Documentation](/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation) sections. + + ```yaml + version: 2 + + models: + - name: int_results + description: '{{ doc("int_results") }}' + - name: int_pit_stops + description: '{{ doc("int_pit_stops") }}' + - name: int_lap_times_years + description: '{{ doc("int_lap_times_years") }}' + ``` + That wraps up the intermediate models we need to create our core models! + +## Core models + +1. Create a file `fct_results.sql`. 
This is what I like to refer to as the “mega table” — a really large denormalized table with all our context added in at row level for human readability. Importantly, we have a table `circuits` that is linked through the table `races`. When we joined `races` to `results` in `int_results.sql` we allowed our tables to make the connection from `circuits` to `results` in `fct_results.sql`. We are only taking information about pit stops at the result level so our join would not cause a [fanout](https://community.looker.com/technical-tips-tricks-1021/what-is-a-fanout-23327). + + ```sql + with int_results as ( + + select * from {{ ref('int_results') }} + + ), + + int_pit_stops as ( + select + race_id, + driver_id, + max(total_pit_stops_per_race) as total_pit_stops_per_race + from {{ ref('int_pit_stops') }} + group by 1,2 + ), + + circuits as ( + + select * from {{ ref('stg_f1_circuits') }} + ), + base_results as ( + select + result_id, + int_results.race_id, + race_year, + race_round, + int_results.circuit_id, + int_results.circuit_name, + circuit_ref, + location, + country, + latitude, + longitude, + altitude, + total_pit_stops_per_race, + race_date, + race_time, + int_results.driver_id, + driver, + driver_number, + drivers_age_years, + driver_nationality, + constructor_id, + constructor_name, + constructor_nationality, + grid, + position, + position_text, + position_order, + points, + laps, + results_time_formatted, + results_milliseconds, + fastest_lap, + results_rank, + fastest_lap_time_formatted, + fastest_lap_speed, + status_id, + status, + dnf_flag + from int_results + left join circuits + on int_results.circuit_id=circuits.circuit_id + left join int_pit_stops + on int_results.driver_id=int_pit_stops.driver_id and int_results.race_id=int_pit_stops.race_id + ) + + select * from base_results + ``` + +1. Create the file `pit_stops_joined.sql`. Our results and pit stops are at different levels of dimensionality (also called grain). 
Simply put, we have multiple pit stops per result. Since we are interested in understanding information at the pit stop level with information about race year and constructor, we will create a new table `pit_stops_joined.sql` where each row is per pit stop. Our new table tees up our aggregation in Python. + + ```sql + with base_results as ( + + select * from {{ ref('fct_results') }} + + ), + + pit_stops as ( + + select * from {{ ref('int_pit_stops') }} + + ), + + pit_stops_joined as ( + + select + base_results.race_id, + race_year, + base_results.driver_id, + constructor_id, + constructor_name, + stop_number, + lap, + lap_time_formatted, + pit_stop_duration_seconds, + pit_stop_milliseconds + from base_results + left join pit_stops + on base_results.race_id=pit_stops.race_id and base_results.driver_id=pit_stops.driver_id + ) + select * from pit_stops_joined + ``` + +1. Enter in the command line and execute `dbt build` to build out our entire pipeline up to this point. Don’t worry about “overriding” your previous models – dbt workflows are designed to be idempotent so we can run them again and expect the same results. + +1. Let’s talk about our lineage so far. It’s looking good 😎. We’ve shown how SQL can be used to make data type, column name changes, and handle hierarchical joins really well; all while building out our automated lineage! + + + +1. Time to **Commit and push** our changes and give your commit a message like `intermediate and fact models` before moving on. diff --git a/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md b/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md new file mode 100644 index 00000000000..522bcf70c65 --- /dev/null +++ b/website/docs/guides/dbt-ecosystem/sl-partner-integration-guide.md @@ -0,0 +1,147 @@ +--- +title: "dbt Semantic Layer integration best practices" +id: "sl-partner-integration-guide" +description: Learn about partner integration guidelines, roadmap, and connectivity. 
+--- + +To fit your tool within the world of the Semantic Layer, dbt Labs offers some best practice recommendations for how to expose metrics and allow users to interact with them seamlessly. + +:::note +This is an evolving guide that is meant to provide recommendations based on our experience. If you have any feedback, we'd love to hear it! +::: + + +## Requirements + +To build a dbt Semantic Layer integration: + +- We offer a [JDBC](/docs/dbt-cloud-apis/sl-jdbc) API and [GraphQL API](/docs/dbt-cloud-apis/sl-graphql). Refer to the dedicated [dbt Semantic Layer API](/docs/dbt-cloud-apis/sl-api-overview) for more technical integration details. + +- Familiarize yourself with the [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) and [MetricFlow](/docs/build/about-metricflow)'s key concepts. There are two main objects: + + - [Semantic models](/docs/build/semantic-models) — Nodes in your semantic graph, connected via entities as edges. MetricFlow takes semantic models defined in YAML configuration files as inputs and creates a semantic graph that you can use to query metrics. + - [Metrics](/docs/build/metrics-overview) — Can be defined in the same YAML files as your semantic models, or split into separate YAML files into any other subdirectories (provided that these subdirectories are also within the same dbt project repo). + +### Connection parameters + +The dbt Semantic Layer APIs authenticate with `environmentId`, `SERVICE_TOKEN`, and `host`. + +We recommend you provide users with separate input fields with these components for authentication (dbt Cloud will surface these parameters for the user). + + +## Best practices on exposing metrics + +Best practices for exposing metrics are summarized into five themes: + +- [Governance](#governance-and-traceability) — Recommendations on how to establish guardrails for governed data work. +- [Discoverability](#discoverability) — Recommendations on how to make user-friendly data interactions. 
+- [Organization](#organization) — Organize metrics and dimensions for all audiences. +- [Query flexibility](#query-flexibility) — Allow users to query either one metric alone without dimensions or multiple metrics with dimensions. +- [Context and interpretation](#context-and-interpretation) — Contextualize metrics for better analysis; expose definitions, metadata, lineage, and freshness. + +### Governance and traceability + +When working with more governed data, it's essential to establish clear guardrails. Here are some recommendations: + +- **Aggregations control** — Users shouldn't generally be allowed to modify aggregations unless they perform post-processing calculations on Semantic Layer data (such as year-over-year analysis). + +- **Time series alignment and using metric_time** — Make sure users view metrics across the correct time series. When displaying metric graphs, using a non-default time aggregation dimension might lead to misleading interpretations. While users can still group by other time dimensions, they should be careful not to create trend lines with incorrect time axes.

          When looking at one or multiple metrics, users should use `metric_time` as the main time dimension to guarantee they are looking at the right time series for the metric(s).

          As such, when building an application, we recommend exposing `metric_time` as a separate, "special" time dimension on its own. This dimension is always going to align with all metrics and be common across them. Other time dimensions can still be looked at and grouped by, but a clear delineation between the `metric_time` dimension and the other time dimensions helps prevent confusion about how metrics should be plotted.

          Also, when a user requests a time granularity change for the main time series, the query that your application runs should use `metric_time` as this will always give you the correct slice. Related to this, we also strongly recommend that you have a way to expose what dimension `metric_time` actually maps to for users who may not be familiar. Our APIs allow you to fetch the actual underlying time dimensions that make up `metric_time` (such as `transaction_date`) so you can expose them to your users. + +- **Units consistency** — If units are supported, it's vital to avoid plotting data incorrectly with different units. Ensuring consistency in unit representation will prevent confusion and misinterpretation of the data. + +- **Traceability of metric and dimension changes** — When users change names of metrics and dimensions for reports, it's crucial to have a traceability mechanism in place to link back to the original source metric name. + + +### Discoverability + +- Consider treating [metrics](/docs/build/metrics-overview) as first-class objects rather than measures. Metrics offer a higher-level and more contextual way to interact with data, reducing the burden on end-users to manually aggregate data. + +- Easy metric interactions: Provide users with an intuitive approach to: + * Search for Metrics — Users should be able to easily search and find relevant metrics. Metrics can serve as the starting point to lead users into exploring dimensions. + * Search for Dimensions — Users should be able to query metrics with associated dimensions, allowing them to gain deeper insights into the data. + * Filter by Dimension Values — Expose and enable users to filter metrics based on dimension values, encouraging data analysis and exploration. + * Filter additional metadata — Allow users to filter metrics based on other available metadata, such as metric type and default time granularity. 
+ +- Suggested Metrics: Ideally, the system should intelligently suggest relevant metrics to users based on their team's activities. This approach encourages user exposure, facilitates learning, and supports collaboration among team members. + +By implementing these recommendations, the data interaction process becomes more user-friendly, empowering users to gain valuable insights without the need for extensive data manipulation. + +### Organization + +We recommend organizing metrics and dimensions in ways that a non-technical user can understand the data model, without needing much context: + +- **Organizing Dimensions** — To help non-technical users understand the data model better, we recommend organizing dimensions based on the entity they originated from. For example, consider dimensions like `user__country` and `product__category`.

          You can create groups by extracting `user` and `product` and then nest the respective dimensions under each group. This way, dimensions align with the entity or semantic model they belong to and make them more user-friendly and accessible. + +- **Organizing Metrics** — The goal is to organize metrics into a hierarchy in our configurations, instead of presenting them in a long list.

          This hierarchy helps you organize metrics based on specific criteria, such as business unit or team. By providing this structured organization, users can find and navigate metrics more efficiently, enhancing their overall data analysis experience. + +### Query flexibility + +Allow users to query either one metric alone without dimensions or multiple metrics with dimensions. + +- Allow toggling between metrics/dimensions seamlessly. + +- Be clear on exposing what dimensions are queryable with what metrics and hide things that don’t apply, and vice versa. + +- Only expose time granularities (monthly, daily, yearly) that match the available metrics. + * For example, if a dbt model and its resulting semantic model have a monthly granularity, make sure querying data with a 'daily' granularity isn't available to the user. Our APIs have functionality that will help you surface the correct granularities. + +- We recommend that time granularity is treated as a general time dimension-specific concept and that it can be applied to more than just the primary aggregation (or `metric_time`). Consider a situation where a user wants to look at `sales` over time by `customer signup month`; in this situation, having the ability to apply granularities to both time dimensions is crucial. Our APIs include information to fetch the granularities for the primary (`metric_time`) dimensions, as well as all time dimensions. You can treat each time dimension and granularity selection independently in your application. Note: Initially, as a starting point, it makes sense to only support `metric_time` or the primary time dimension, but we recommend expanding that as your solution evolves. + +- You should allow users to filter on date ranges and expose a calendar and nice presets for filtering these. + * For example, last 30 days, last week, and so on. 
+ +### Context and interpretation + +For better analysis, it's best to have the context of the metrics close to where the analysis is happening. We recommend the following: + +- Expose business definitions of the metrics as well as logical definitions. + +- Expose additional metadata from the Semantic layer (measures, type parameters). + +- Use the [Discovery API](/docs/dbt-cloud-apis/discovery-api) to enhance the metric and build confidence in its accuracy: + * Check if the metric is fresh and when it was last updated. + * Include lineage information to understand the metric's origin. + +- Allow for creating other metadata that’s useful for the metric. We can provide some of this information in our configuration (Display name, Default Granularity for View, Default Time range), but there may be other metadata that your tool wants to provide to make the metric richer. + +## Example stages of an integration + +These are recommendations on how to evolve a Semantic Layer integration and not a strict runbook. + +**Stage 1 - The basic** +* Supporting and using [JDBC](/docs/dbt-cloud-apis/sl-jdbc) or [GraphQL](/docs/dbt-cloud-apis/sl-graphql) is the first step. Refer to the [dbt Semantic Layer APIs](/docs/dbt-cloud-apis/sl-api-overview) for more technical details. 
+ +**Stage 2 - More discoverability and basic querying** +* Support listing metrics defined in the project +* Listing available dimensions based on one or many metrics +* Querying defined metric values on their own or grouping by available dimensions +* Display metadata from [Discovery API](/docs/dbt-cloud-apis/discovery-api) and other context + +**Stage 3 - More querying flexibility and better user experience (UX)** +* More advanced filtering + * Time filters with good presets/calendar UX + * Filtering metrics on a pre-populated set of dimension values +* Make dimension values more user-friendly by organizing them effectively +* Intelligent filtering of metrics based on available dimensions and vice versa + +**Stage 4 - More custom user interface (UI) / Collaboration** +* A place where users can see all the relevant information about a given metric +* Organize metrics by hierarchy and more advanced search features (such as filter on the type of metric or other metadata) +* Use and expose more metadata +* Querying dimensions without metrics and other more advanced querying functionality +* Suggest metrics to users based on teams/identity, and so on. + +### A note on transparency and using compile + +For transparency and additional context, we recommend you have an easy way for the user to obtain the SQL that MetricFlow generates. Depending on what API you are using, you can do this by using our compile parameter. This is incredibly powerful because we want to be very transparent to the user about what we're doing and do not want to be a black box. This would be mostly beneficial to a technical user. + + +### A note on where filters + +In the cases where our APIs support either a string or a filter list for the `where` clause, we always recommend that your application utilizes the filter list in order to gain maximum pushdown benefits. 
The `where` string may be more intuitive for users writing queries during testing, but it will not have the performance benefits of the filter list in a production environment. + +## Related docs + +- [Use the dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) to learn about the product. +- [Build your metrics](/docs/build/build-metrics-intro) for more info about MetricFlow and its components. +- [dbt Semantic Layer integrations page](https://www.getdbt.com/product/semantic-layer-integrations) for information about the available partner integrations. diff --git a/website/docs/guides/legacy/best-practices.md b/website/docs/guides/legacy/best-practices.md index 5a6983bb3fe..1fbcbc72cc1 100644 --- a/website/docs/guides/legacy/best-practices.md +++ b/website/docs/guides/legacy/best-practices.md @@ -16,9 +16,9 @@ We've codified our best practices in Git, in our [Git guide](https://github.com/ ::: ### Use separate development and production environments -dbt makes it easy to maintain separate production and development environments through the use of target within a profile. We recommend using a `dev` target when running dbt from your command line, and only running against a `prod` target when running from a production deployment. You can read more [about managing environments](/docs/collaborate/environments). +dbt makes it easy to maintain separate production and development environments through the use of targets within a profile. We recommend using a `dev` target when running dbt from your command line and only running against a `prod` target when running from a production deployment. You can read more [about managing environments here](/docs/environments-in-dbt). -### Use a style guide and for your project +### Use a style guide for your project SQL styles, field naming conventions, and other rules for your dbt project should be codified, especially on projects where multiple dbt users are writing code. 
:::info Our style guide @@ -30,7 +30,7 @@ We've made our [style guide](https://github.com/dbt-labs/corp/blob/main/dbt_styl ## Best practices in dbt projects ### Use the ref function -The [ref](ref) function is what makes dbt so powerful! Using the `ref` function allows dbt to infer dependencies, ensuring that models are built in the correct order. It also ensures that your current model selects from upstream tables and views in the same environment that you're working in. +The [ref](/reference/dbt-jinja-functions/ref) function is what makes dbt so powerful! Using the `ref` function allows dbt to infer dependencies, ensuring that models are built in the correct order. It also ensures that your current model selects from upstream tables and views in the same environment that you're working in. Always use the `ref` function when selecting from another model, rather than using the direct relation reference (e.g. `my_schema.my_table`). ### Limit references to raw data @@ -57,7 +57,7 @@ All subsequent data models should be built on top of these models, reducing the Earlier versions of this documentation recommended implementing “base models” as the first layer of transformation, and gave advice on the SQL within these models. We realized that while the reasons behind this convention were valid, the specific advice around "base models" represented an opinion, so we moved it out of the official documentation. -You can instead find our opinions on how we structure our dbt projects in [this Discourse article](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355). +You can instead find our opinions on [how we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview). ::: @@ -70,7 +70,7 @@ Complex models often include multiple Common Table Expressions (C ### Group your models in directories Within your `models/` directory, you can have any number of nested subdirectories. 
We leverage directories heavily, since using a nested structure within directories makes it easier to: * Configure groups of models, by specifying configurations in your `dbt_project.yml` file. -* Run subsections of your DAG, by using the [model selection syntax](node-selection/syntax). +* Run subsections of your DAG, by using the [model selection syntax](/reference/node-selection/syntax). * Communicate modeling steps to collaborators * Create conventions around the allowed upstream dependencies of a model, for example, "models in the `marts` directory can only select from other models in the `marts` directory, or from models in the `staging` directory". @@ -92,7 +92,7 @@ When a user connects to a via a SQL client, they of * Use prefixes in names (for example, `stg_`, `fct_` and `dim_`) to indicate which relations should be queried by end users. ### Choose your materializations wisely -[](materializations) determine the way models are built through configuration. As a general rule: +[](/docs/build/materializations) determine the way models are built through configuration. As a general rule: * Views are faster to build, but slower to query compared to tables. * Incremental models provide the same query performance as tables, are faster to build compared to the table , however they introduce complexity into a project. @@ -105,25 +105,21 @@ We often: ## Pro-tips for workflows ### Use the model selection syntax when running locally -When developing, it often makes sense to only run the model you are actively working on and any downstream models. You can choose which models to run by using the [model selection syntax](node-selection/syntax). +When developing, it often makes sense to only run the model you are actively working on and any downstream models. You can choose which models to run by using the [model selection syntax](/reference/node-selection/syntax). 
### Run only modified models to test changes ("slim CI") -To merge code changes with confidence, you want to know that those changes will not cause breakages elsewhere in your project. For that reason, we recommend running models and tests in a sandboxed environment, separated from your production data, as an automatic check in your git workflow. (If you use GitHub and dbt Cloud, read about [how to set up CI jobs](/docs/deploy/cloud-ci-job). +To merge code changes with confidence, you want to know that those changes will not cause breakages elsewhere in your project. For that reason, we recommend running models and tests in a sandboxed environment, separated from your production data, as an automatic check in your git workflow. (If you use GitHub and dbt Cloud, read about [how to set up CI jobs](/docs/deploy/ci-jobs).) At the same time, it costs time (and money) to run and test all the models in your project. This inefficiency feels especially painful if your PR only proposes changes to a handful of models. -New in v0.18.0 - By comparing to artifacts from a previous production run, dbt can determine which models are modified and build them on top of their unmodified parents. ```bash dbt run -s state:modified+ --defer --state path/to/prod/artifacts -dbt test -s state:modified+ +dbt test -s state:modified+ --defer --state path/to/prod/artifacts ``` -New in v1.0.0 - By comparing to artifacts from a previous production run, dbt can determine model and test result statuses. - `result:fail` @@ -159,17 +155,6 @@ dbt test --select result:fail --exclude --defer --state path/to/p > Note: If you're using the `--state target/` flag, `result:error` and `result:fail` flags can only be selected concurrently (in the same command) if using the `dbt build` command. `dbt test` will overwrite the `run_results.json` from `dbt run` in a previous command invocation. -:::caution Experimental functionality -The `source_status` selection method is experimental and subject to change. 
During this time, ongoing improvements may limit this feature’s availability and cause breaking changes to its functionality. -::: - - - -Only supported by v1.1 or newer. - - - - Only supported by v1.1 or newer. @@ -188,13 +173,11 @@ dbt source freshness # must be run again to compare current to previous state dbt build --select source_status:fresher+ --state path/to/prod/artifacts ``` - - -To learn more, read the docs on [state](understanding-state). +To learn more, read the docs on [state](/reference/node-selection/syntax#about-node-selection). ## Pro-tips for dbt Projects ### Limit the data processed when in development -In a development environment, faster run times allow you to iterate your code more quickly. We frequently speed up our runs by using a pattern that limits data based on the [target](target) name: +In a development environment, faster run times allow you to iterate your code more quickly. We frequently speed up our runs by using a pattern that limits data based on the [target](/reference/dbt-jinja-functions/target) name: ```sql select * @@ -205,7 +188,7 @@ where created_at >= dateadd('day', -3, current_date) ``` ### Use hooks to manage privileges on objects that dbt creates -Use `grant` statements from [hooks](hooks-operations) to ensure that permissions are applied to the objects created by dbt. By codifying these grant statements in hooks, you can version control and repeatably apply these permissions. +Use `grant` statements from [hooks](/docs/build/hooks-operations) to ensure that permissions are applied to the objects created by dbt. By codifying these grant statements in hooks, you can version control and repeatably apply these permissions. 
:::info Recommended grant statements diff --git a/website/docs/guides/legacy/building-packages.md b/website/docs/guides/legacy/building-packages.md index 55f9821852e..2a6803334d4 100644 --- a/website/docs/guides/legacy/building-packages.md +++ b/website/docs/guides/legacy/building-packages.md @@ -23,13 +23,13 @@ Packages are _not_ a good fit for sharing models that contain business-specific We tend to use the CLI for package development. The development workflow often involves installing a local copy of your package in another dbt project — at present dbt Cloud is not designed for this workflow. ::: -1. Use the [dbt init](init) command to create a new dbt project, which will be your package: +1. Use the [dbt init](/reference/commands/init) command to create a new dbt project, which will be your package: ```shell $ dbt init [package_name] ``` 2. Create a public GitHub¹ repo, named `dbt-`, e.g. `dbt-mailchimp`. Follow the GitHub instructions to link this to the dbt project you just created. 3. Update the `name:` of the project in `dbt_project.yml` to your package name, e.g. `mailchimp`. -4. Define the allowed dbt versions by using the [`require-dbt-version` config](require-dbt-version). +4. Define the allowed dbt versions by using the [`require-dbt-version` config](/reference/project-configs/require-dbt-version). ¹Currently, our package registry only supports packages that are hosted in GitHub. @@ -41,7 +41,7 @@ When working on your package, we often find it useful to install a local copy of ### Follow our best practices _Modeling packages only_ -Use our [dbt coding conventions](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md), our article on [how we structure our dbt projects](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355), and our [best practices](best-practices) for all of our advice on how to build your dbt project. 
+Use our [dbt coding conventions](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md), our article on [how we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview), and our [best practices](best-practices) for all of our advice on how to build your dbt project. This is where it comes in especially handy to have worked on your own dbt project previously. @@ -74,8 +74,8 @@ Many SQL functions are specific to a particular database. For example, the funct If you wish to support multiple warehouses, we have a number of tricks up our sleeve: - We've written a number of macros that compile to valid SQL snippets on each of the original four adapters. Where possible, leverage these macros. -- If you need to implement cross-database compatibility for one of your macros, use the [`adapter.dispatch` macro](dispatch) to achieve this. Check out the cross-database macros in dbt-utils for examples. -- If you're working on a modeling package, you may notice that you need write different models for each warehouse (for example, if the EL tool you are working with stores data differently on each warehouse). In this case, you can write different versions of each model, and use the [`enabled` config](enabled), in combination with [`target.type`](/reference/dbt-jinja-functions/target) to enable the correct models — check out [this package](https://github.com/fivetran/dbt_facebook_ads_creative_history/blob/main/dbt_project.yml#L11-L16) as an example. +- If you need to implement cross-database compatibility for one of your macros, use the [`adapter.dispatch` macro](/reference/dbt-jinja-functions/dispatch) to achieve this. Check out the cross-database macros in dbt-utils for examples. +- If you're working on a modeling package, you may notice that you need write different models for each warehouse (for example, if the EL tool you are working with stores data differently on each warehouse). 
In this case, you can write different versions of each model, and use the [`enabled` config](/reference/resource-configs/enabled), in combination with [`target.type`](/reference/dbt-jinja-functions/target) to enable the correct models — check out [this package](https://github.com/fivetran/dbt_facebook_ads_creative_history/blob/main/dbt_project.yml#L11-L16) as an example. If your package has only been written to work for one , make sure you document this in your package README. @@ -94,7 +94,7 @@ The major exception to this is when working with data sources that benefit from ### Test and document your package It's critical that you [test](/docs/build/tests) your models and sources. This will give your end users confidence that your package is actually working on top of their dataset as intended. -Further, adding [documentation](documentation) via descriptions will help communicate your package to end users, and benefit their stakeholders that use the outputs of this package. +Further, adding [documentation](/docs/collaborate/documentation) via descriptions will help communicate your package to end users, and benefit their stakeholders that use the outputs of this package. ### Include useful GitHub artifacts Over time, we've developed a set of useful GitHub artifacts that make administering our packages easier for us. In particular, we ensure that we include: - A useful README, that has: @@ -126,7 +126,7 @@ packages:
          4. Add resources to the package (seeds, models, tests) so that you can successfully run your project, and compare the output with what you expect. The exact appraoch here will vary depending on your packages. In general you will find that you need to: - - Add mock data via a [seed](seeds) with a few sample (anonymized) records. Configure the `integration_tests` project to point to the seeds instead of raw data tables. + - Add mock data via a [seed](/docs/build/seeds) with a few sample (anonymized) records. Configure the `integration_tests` project to point to the seeds instead of raw data tables. - Add more seeds that represent the expected output of your models, and use the [dbt_utils.equality](https://github.com/dbt-labs/dbt-utils#equality-source) test to confirm the output of your package, and the expected output matches. diff --git a/website/docs/guides/legacy/creating-date-partitioned-tables.md b/website/docs/guides/legacy/creating-date-partitioned-tables.md index 6dda1579192..8c461dbe4a8 100644 --- a/website/docs/guides/legacy/creating-date-partitioned-tables.md +++ b/website/docs/guides/legacy/creating-date-partitioned-tables.md @@ -6,7 +6,7 @@ id: "creating-date-partitioned-tables" :::caution Deprecated -The functionality described below was introduced in dbt Core v0.10 (March 2018). In v1.0 (December 2021), it was deprecated in favor of [column-based partitioning](bigquery-configs#partition-clause) and [incremental modeling](/docs/build/incremental-models). +The functionality described below was introduced in dbt Core v0.10 (March 2018). In v1.0 (December 2021), it was deprecated in favor of [column-based partitioning](/reference/resource-configs/bigquery-configs#partition-clause) and [incremental modeling](/docs/build/incremental-models). 
::: diff --git a/website/docs/guides/legacy/debugging-schema-names.md b/website/docs/guides/legacy/debugging-schema-names.md index 3a786afe5b7..dee2bc57293 100644 --- a/website/docs/guides/legacy/debugging-schema-names.md +++ b/website/docs/guides/legacy/debugging-schema-names.md @@ -2,7 +2,7 @@ title: Debugging schema names --- -If a model uses the [`schema` config](resource-configs/schema) but builds under an unexpected schema, here are some steps for debugging the issue. +If a model uses the [`schema` config](/reference/resource-properties/schema) but builds under an unexpected schema, here are some steps for debugging the issue. :::info The full explanation on custom schemas can be found [here](/docs/build/custom-schemas). @@ -16,7 +16,7 @@ You can also follow along via this video: Do a file search to check if you have a macro named `generate_schema_name` in the `macros` directory of your project. #### I do not have a macro named `generate_schema_name` in my project -This means that you are using dbt's default implementation of the macro, as defined [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L17-L30) +This means that you are using dbt's default implementation of the macro, as defined [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L47C1-L60) ```sql {% macro generate_schema_name(custom_schema_name, node) -%} @@ -44,8 +44,7 @@ If your `generate_schema_name` macro looks like so: {{ generate_schema_name_for_env(custom_schema_name, node) }} {%- endmacro %} ``` -Your project is switching out the `generate_schema_name` macro for another macro, `generate_schema_name_for_env`. Similar to the above example, this is a macro which is defined in dbt's global project, [here](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/include/global_project/macros/etc/get_custom_schema.sql#L43-L56). 
- +Your project is switching out the `generate_schema_name` macro for another macro, `generate_schema_name_for_env`. Similar to the above example, this is a macro which is defined in dbt's global project, [here](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/get_custom_name/get_custom_schema.sql#L47-L60). ```sql {% macro generate_schema_name_for_env(custom_schema_name, node) -%} @@ -70,11 +69,11 @@ In all cases take a moment to read through the Jinja to see if you can follow th ### 2. Confirm your `schema` config -Check if you are using the [`schema` config](resource-configs/schema) in your model, either via a `{{ config() }}` block, or from `dbt_project.yml`. In both cases, dbt passes this value as the `custom_schema_name` parameter of the `generate_schema_name` macro. +Check if you are using the [`schema` config](/reference/resource-properties/schema) in your model, either via a `{{ config() }}` block, or from `dbt_project.yml`. In both cases, dbt passes this value as the `custom_schema_name` parameter of the `generate_schema_name` macro. ### 3. Confirm your target values -Most `generate_schema_name` macros incorporate logic from the [`target` variable](target), in particular `target.schema` and `target.name`. Use the docs [here](target) to help you find the values of each key in this dictionary. +Most `generate_schema_name` macros incorporate logic from the [`target` variable](/reference/dbt-jinja-functions/target), in particular `target.schema` and `target.name`. Use the docs [here](/reference/dbt-jinja-functions/target) to help you find the values of each key in this dictionary. ### 4. Put the two together @@ -84,7 +83,7 @@ Now, re-read through the logic of your `generate_schema_name` macro, and mentall You should find that the schema dbt is constructing for your model matches the output of your `generate_schema_name` macro. 
:::info -Note that snapshots do not follow this behavior, check out the docs on [target_schema](resource-configs/target_schema) instead. +Note that snapshots do not follow this behavior, check out the docs on [target_schema](/reference/resource-configs/target_schema) instead. ::: ### 5. Adjust as necessary diff --git a/website/docs/guides/legacy/understanding-state.md b/website/docs/guides/legacy/understanding-state.md deleted file mode 100644 index 9573452e376..00000000000 --- a/website/docs/guides/legacy/understanding-state.md +++ /dev/null @@ -1,101 +0,0 @@ ---- -title: "Understanding state" ---- - - - - - The `--state` flag was introduced in dbt v0.18.0 - - The `result` selector was introduced in dbt v1.0.0 - - - -One of the greatest underlying assumptions about dbt is that its operations should be **stateless** and ****. That is, it doesn't matter how many times a model has been run before, or if it has ever been run before. It doesn't matter if you run it once or a thousand times. Given the same raw data, you can expect the same transformed result. A given run of dbt doesn't need to "know" about _any other_ run; it just needs to know about the code in the project and the objects in your database as they exist _right now_. - -That said, dbt does store "state"—a detailed, point-in-time view of project resources, database objects, and invocation results—in the form of its [artifacts](dbt-artifacts). If you choose, dbt can use these artifacts to inform certain operations. Crucially, the operations themselves are still stateless and : given the same manifest and the same raw data, dbt will produce the same transformed result. - -dbt can leverage artifacts from a prior invocation as long as their file path is passed to the `--state` flag. This is a prerequsite for: -- [The `state:` selector](methods#the-state-method), whereby dbt can identify resources that are new or modified -by comparing code in the current project against the state manifest. 
-- [Deferring](defer) to another environment, whereby dbt can identify upstream, unselected resources that don't exist in your current environment and instead "defer" their references to the environment provided by the state manifest. - -Together, these two features enable ["slim CI"](best-practices#run-only-modified-models-to-test-changes-slim-ci). We expect to add more features in future releases that can leverage artifacts passed to the `--state` flag. - -### Establishing state - -State and defer can be set by environment variables as well as CLI flags: - -- `--state` or `DBT_ARTIFACT_STATE_PATH`: file path -- `--defer` or `DBT_DEFER_TO_STATE`: boolean - -If both the flag and env var are provided, the flag takes precedence. - -#### Notes: -- The `--state` artifacts must be of schema versions that are compatible with the currently running dbt version. -- The path to state artifacts can be set via the `--state` flag or `DBT_ARTIFACT_STATE_PATH` environment variable. If both the flag and env var are provided, the flag takes precedence. -- These are powerful, complex features. Read about [known caveats and limitations](node-selection/state-comparison-caveats) to state comparison. - -### The "result" status - -Another element of job state is the `result` of a prior dbt invocation. After executing a `dbt run`, for example, dbt creates the `run_results.json` artifact which contains execution times and success / error status for dbt models. You can read more about `run_results.json` on the ['run results'](/docs/reference/artifacts/run-results-json) page. 
- -The following dbt commands produce `run_results.json` artifacts whose results can be referenced in subsequent dbt invocations: -- `dbt run` -- `dbt test` -- `dbt build` (new in dbt version v0.21.0) -- `dbt seed` - -After issuing one of the above commands, you can reference the results by adding a selector to a subsequent command as follows: - -```bash -# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. -$ dbt run --select result: --defer --state path/to/prod/artifacts -``` - -The available options depend on the node type: - -| | model | seed | snapshot | test | -|----------------|-------|------|------|----------| -| `result:error` | ✅ | ✅ | ✅ | ✅ | -| `result:success` | ✅ | ✅ | ✅ | | -| `result:skipped` | ✅ | | ✅ | ✅ | -| `result:fail` | | | | ✅ | -| `result:warn` | | | | ✅ | -| `result:pass` | | | | ✅ | - -### Combining `state` and `result` selectors - -The state and result selectors can also be combined in a single invocation of dbt to capture errors from a previous run AND any new or modified models. - -```bash -$ dbt run --select result:+ state:modified+ --defer --state ./ -``` - -### The "source_status" status - - -Only supported by v1.1 or newer. - - - - - -Only supported by v1.1 or newer. - -:::caution Experimental functionality -The `source_status` selection method is experimental and subject to change. During this time, ongoing improvements may limit this feature’s availability and cause breaking changes to its functionality. -::: - - -Another element of job state is the `source_status` of a prior dbt invocation. After executing `dbt source freshness`, for example, dbt creates the `sources.json` artifact which contains execution times and `max_loaded_at` dates for dbt sources. You can read more about `sources.json` on the ['sources'](/docs/reference/artifacts/sources-json) page. 
- -The following dbt commands produce `sources.json` artifacts whose results can be referenced in subsequent dbt invocations: -- `dbt source freshness` - -After issuing one of the above commands, you can reference the source freshness results by adding a selector to a subsequent command as follows: - -```bash -# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. -$ dbt source freshness # must be run again to compare current to previous state -$ dbt build --select source_status:fresher+ --state path/to/prod/artifacts -``` - \ No newline at end of file diff --git a/website/docs/guides/migration/sl-migration.md b/website/docs/guides/migration/sl-migration.md new file mode 100644 index 00000000000..56cd6dc9d80 --- /dev/null +++ b/website/docs/guides/migration/sl-migration.md @@ -0,0 +1,127 @@ +--- +title: "Legacy dbt Semantic Layer migration guide" +sidebar_label: "Legacy dbt Semantic Layer migration" +description: "Learn how to migrate from the legacy dbt Semantic Layer to the latest one." +tags: [Semantic Layer] +--- + +The legacy Semantic Layer will be deprecated in H2 2023. Additionally, the `dbt_metrics` package will not be supported in dbt v1.6 and later. If you are using `dbt_metrics`, you'll need to upgrade your configurations before upgrading to v1.6. This guide is for people who have the legacy dbt Semantic Layer setup and would like to migrate to the new dbt Semantic Layer. The estimated migration time is two weeks. + + +## Step 1: Migrate metric configs to the new spec + +The metrics specification in dbt Core is changed in v1.6 to support the integration of MetricFlow. It's strongly recommended that you refer to [Build your metrics](/docs/build/build-metrics-intro) before getting started so you understand the core concepts of the Semantic Layer.
+ +dbt Labs recommends completing these steps in a local dev environment (such as the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation)) instead of the dbt Cloud IDE: + +1. Create new Semantic Model configs as YAML files in your dbt project.* +1. Upgrade the metrics configs in your project to the new spec.* +1. Delete your old metrics file or remove the `.yml` file extension so they're ignored at parse time. Remove the `dbt-metrics` package from your project. Remove any macros that reference `dbt-metrics`, like `metrics.calculate()`. Make sure that any packages you’re using don't have references to the old metrics spec. +1. Install the CLI with `pip install "dbt-metricflow[your_adapter_name]"`. For example: + + ```bash + pip install "dbt-metricflow[snowflake]" + ``` + **Note** - The MetricFlow CLI is not available in the IDE at this time. Support is coming soon. + +1. Run `dbt parse`. This parses your project and creates a `semantic_manifest.json` file in your target directory. MetricFlow needs this file to query metrics. If you make changes to your configs, you will need to parse your project again. +1. Run `mf list metrics` to view the metrics in your project. +1. Test querying a metric by running `mf query --metrics --group-by `. For example: + ```bash + mf query --metrics revenue --group-by metric_time + ``` +1. Run `mf validate-configs` to run semantic and warehouse validations. This ensures your configs are valid and the underlying objects exist in your warehouse. +1. Push these changes to a new branch in your repo. + +**To make this process easier, dbt Labs provides a [custom migration tool](https://github.com/dbt-labs/dbt-converter) that automates these steps for you. You can find installation instructions in the [README](https://github.com/dbt-labs/dbt-converter/blob/master/README.md). 
Derived metrics aren’t supported in the migration tool, and will have to be migrated manually.* + +## Step 2: Audit metric values after the migration + +You might need to audit metric values during the migration to ensure that the historical values of key business metrics are the same. + +1. In the CLI, query the metric(s) and dimensions you want to test and include the `--explain` option. For example: + ```bash + mf query --metrics orders,revenue --group-by metric_time__month,customer_type --explain + ``` +1. Use the SQL that MetricFlow generates to create a temporary model in your project, like `tmp_orders_revenue_audit.sql`. You will use this temporary model to compare against your legacy metrics. +1. If you haven’t already done so, create a model using `metrics.calculate()` for the metrics you want to compare against. For example: + + ```sql + select * + from {{ metrics.calculate( + [metric('orders'), + metric('revenue')], + grain='week', + dimensions=['metric_time', 'customer_type'], + ) }} + ``` + +1. Run the [dbt-audit](https://github.com/dbt-labs/dbt-audit-helper) helper on both models to compare the metric values. + +## Step 3: Setup the Semantic Layer in a new environment + +This step is only relevant to users who want the legacy and new semantic layer to run in parallel for a short time. This will let you recreate content in downstream tools like Hex and Mode with minimal downtime. If you do not need to recreate assets in these tools, skip to step 5. + +1. Create a new deployment environment in dbt Cloud and set the dbt version to 1.6 or higher. + +2. Select **Only run on a custom branch** and point to the branch that has the updated metric definition. + +3. Set the deployment schema to a temporary migration schema, such as `tmp_sl_migration`. Optionally, you can create a new database for the migration. + +4. Create a job to parse your project, such as `dbt parse`, and run it. Make sure this job succeeds.
There needs to be a successful job in your environment in order to set up the semantic layer. + +5. Select **Account Settings** -> **Projects** -> **Project details** and choose **Configure the Semantic Layer**. + +6. Under **Environment**, select the deployment environment you created in the previous step. Save your configuration. + +7. In the **Project details** page, click **Generate service token** and grant it **Semantic Layer Only** and **Metadata Only** permissions. Save this token securely. You will need it to connect to the semantic layer. + + +At this point, both the new semantic layer and the old semantic layer will be running. The new semantic layer will be pointing at your migration branch with the updated metrics definitions. + +## Step 4: Update connection in downstream integrations + +Now that your Semantic Layer is set up, you will need to update any downstream integrations that used the legacy Semantic Layer. + +### Migration guide for Hex + +To learn more about integrating with Hex, check out their [documentation](https://learn.hex.tech/docs/connect-to-data/data-connections/dbt-integration#dbt-semantic-layer-integration) for more info. Additionally, refer to [dbt Semantic Layer cells](https://learn.hex.tech/docs/logic-cell-types/transform-cells/dbt-metrics-cells) to set up SQL cells in Hex. + +1. Set up a new connection for the Semantic Layer for your account. Something to note is that your old connection will still work. The following Loom video guides you in setting up your Semantic Layer with Hex: + + + +2. Re-create the dashboards or reports that use the legacy dbt Semantic Layer. + +3. For specific SQL syntax details, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API. + + * **Note** — You will need to update your connection to your production environment once you merge your changes to main. 
Currently, this connection will be pointing at the semantic layer migration environment. + +### Migration guide for Mode + +1. Set up a new connection for the semantic layer for your account. Follow [Mode's docs to set up your connection](https://mode.com/help/articles/supported-databases/#dbt-semantic-layer). + +2. Re-create the dashboards or reports that use the legacy dbt Semantic Layer. + +3. For specific SQL syntax details, refer to [Querying the API for metric metadata](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) to query metrics using the API. + +## Step 5: Merge your metrics migration branch to main, and upgrade your production environment to 1.6. + +1. Upgrade your production environment to 1.6 or higher. + * **Note** — The old metrics definitions are no longer valid so your dbt jobs will not pass. + +2. Merge your updated metrics definitions to main. **At this point the legacy semantic layer will no longer work.** + +If you created a new environment in [Step 3](#step-3-setup-the-semantic-layer-in-a-new-environment): + +3. Update your Environment in **Account Settings** -> **Project Details** -> **Edit Semantic Layer Configuration** to point to your production environment. + +4. Delete your migration environment. Be sure to update your connection details in any downstream tools to account for the environment change.
+ +## Related docs + +- [MetricFlow quickstart guide](/docs/build/sl-getting-started) +- [Example dbt project](https://github.com/dbt-labs/jaffle-sl-template) +- [dbt metrics converter](https://github.com/dbt-labs/dbt-converter) +- [Why we're deprecating the dbt_metrics package](/blog/deprecating-dbt-metrics) blog post +- [dbt Semantic Layer API query syntax](/docs/dbt-cloud-apis/sl-jdbc#querying-the-api-for-metric-metadata) diff --git a/website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md b/website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md index 1a7d41600ba..f5549c58416 100644 --- a/website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md +++ b/website/docs/guides/migration/tools/migrating-from-spark-to-databricks.md @@ -3,77 +3,95 @@ title: "Migrating from dbt-spark to dbt-databricks" id: "migrating-from-spark-to-databricks" --- +You can [migrate your projects](#migrate-your-dbt-projects) from using the `dbt-spark` adapter to using the [dbt-databricks adapter](https://github.com/databricks/dbt-databricks). In collaboration with dbt Labs, Databricks built this adapter using dbt-spark as the foundation and added some critical improvements. With it, you get an easier set up — requiring only three inputs for authentication — and more features such as support for [Unity Catalog](https://www.databricks.com/product/unity-catalog). -## Pre-requisites +## Simpler authentication -In order to migrate to dbt-databricks, your project must be compatible with `dbt 1.0` or greater as dbt-databricks is not supported pre `dbt 1.0`. [This guide](https://docs.getdbt.com/guides/migration/versions/upgrading-to-v1.0) will help you upgrade your project if necessary. +Previously, you had to provide a `cluster` or `endpoint` ID which was hard to parse from the `http_path` that you were given. 
Now, it doesn't matter if you're using a cluster or an SQL endpoint because the [dbt-databricks setup](/docs/core/connect-data-platform/databricks-setup) requires the _same_ inputs for both. All you need to provide is: +- hostname of the Databricks workspace +- HTTP path of the Databricks SQL warehouse or cluster +- appropriate credentials -## Why change to dbt-databricks? +## Better defaults -The Databricks team, in collaboration with dbt Labs, built on top of the foundation that the dbt Labs’ dbt-spark adapter provided, and they added some critical improvements. The dbt-databricks adapter offers an easier set up, as it only requires three inputs for authentication, and it also has more features available via the Delta file format. +The `dbt-databricks` adapter provides better defaults than `dbt-spark` does. The defaults help optimize your workflow so you can get the fast performance and cost-effectiveness of Databricks. They are: -### Authentication Simplification +- The dbt models use the [Delta](https://docs.databricks.com/delta/index.html) table format. You can remove any declared configurations of `file_format = 'delta'` since they're now redundant. +- Accelerate your expensive queries with the [Photon engine](https://docs.databricks.com/runtime/photon.html). +- The `incremental_strategy` config is set to `merge`. -Previously users had to provide a `cluster` or `endpoint` ID which was hard to parse out of the http_path provided in the Databricks UI. Now the [dbt-databricks profile](https://docs.getdbt.com/reference/warehouse-setups/databricks-setup) requires the same inputs regardless if you are using a Cluster or a SQL endpoint. All you need to provide is: -- the hostname of the Databricks workspace -- the HTTP path of the Databricks SQL warehouse or cluster -- an appropriate credential +With dbt-spark, however, the default for `incremental_strategy` is `append`. 
If you want to continue using `incremental_strategy=append`, you must set this config specifically on your incremental models. If you already specified `incremental_strategy=merge` on your incremental models, you don't need to change anything when moving to dbt-databricks; but, you can keep your models clean (tidy) by removing the config since it's redundant. Read [About incremental_strategy](/docs/build/incremental-models#about-incremental_strategy) to learn more. +For more information on defaults, see [Caveats](/docs/core/connect-data-platform/databricks-setup#caveats). -### Better defaults +## Pure Python -With dbt-databricks, by default, dbt models will use the Delta format and expensive queries will be accelerated with the [Photon engine](https://docs.databricks.com/runtime/photon.html). See [the caveats section of Databricks Profile documentation](https://docs.getdbt.com/reference/warehouse-profiles/databricks-profile#choosing-between-dbt-databricks-and-dbt-spark) for more information. Any declared configurations of `file_format = 'delta'` are now redundant and can be removed. +If you use dbt Core, you no longer have to download an independent driver to interact with Databricks. The connection information is all embedded in a pure-Python library called `databricks-sql-connector`. -Additionally, dbt-databricks's default `incremental_strategy` is now `merge`. The default `incremental_strategy` with dbt-spark is `append`. -If you have been using the default `incremental_strategy=append` with dbt-spark, and would like to continue doing so, you'll have to set this config specifically on your incremental models. Read more [about `incremental_strategy` in dbt](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models#about-incremental_strategy). 
-If you already specified `incremental_strategy=merge` on your incremental models, you do not need to change anything when moving to dbt-databricks, though you could remove the param as it is now the default. -### Pure Python (Core only) +## Migrate your dbt projects -A huge benefit to Core only users is that with the new dbt-databricks adapter, you no longer have to download an independent driver to interact with Databricks. The connection information is all embedded in a pure-Python library, `databricks-sql-connector`. +In both dbt Core and dbt Cloud, you can migrate your projects to the Databricks-specific adapter from the generic Apache Spark adapter. +### Prerequisites -## Migration -### dbt Cloud +- Your project must be compatible with dbt 1.0 or greater. Refer to [Upgrading to v1.0](/guides/migration/versions/upgrading-to-v1.0) for details. For the latest version of dbt, refer to [Upgrading to v1.3](/guides/migration/versions/upgrading-to-v1.3). +- For dbt Cloud, you need administrative (admin) privileges to migrate dbt projects. -#### Credentials -If you are already successfully connected to Databricks using the dbt-spark ODBC method in dbt Cloud, then you have already supplied credentials in dbt Cloud to connect to your Databricks workspace. Each user will have added their Personal Access Token in their dbt Cloud profile for the given dbt project, which allows them to connect to Databricks in the dbt Cloud IDE, and additionally, an admin will have added an access token for each deployment environment, allowing for dbt Cloud to connect to Databricks during production jobs. + + -When an admin changes the dbt Cloud's connection to use the dbt-databricks adapter instead of the dbt-spark adapter, your team will not lose their credentials. This makes migrating from dbt-spark to dbt-databricks straightforward as it only requires deleting the connection and re-adding the cluster/endpoint information. 
Both the admin and users of the project need not re-enter personal access tokens. + -#### Procedure +The migration to the `dbt-databricks` adapter from `dbt-spark` shouldn't cause any downtime for production jobs. dbt Labs recommends that you schedule the connection change when usage of the IDE is light to avoid disrupting your team. -An admin of the dbt Cloud project running on Databricks should take the following steps to migrate from using the generic Spark adapter to the Databricks-specfic adapter. This should not cause any downtime for production jobs, but we recommend that you schedule the connection change when there is not heavy IDE usage for your team to avoid disruption. +To update your Databricks connection in dbt Cloud: 1. Select **Account Settings** in the main navigation bar. -2. On the Projects tab, scroll until you find the project you'd like to migrate to the new dbt-databricks adapter. +2. On the **Projects** tab, find the project you want to migrate to the dbt-databricks adapter. 3. Click the hyperlinked Connection for the project. -4. Click the "Edit" button in the top right corner. -5. Select Databricks for the warehouse -6. Select Databricks (dbt-databricks) for the adapter and enter: - 1. the `hostname` - 2. the `http_path` - 3. optionally the catalog name -7. Click save. +4. Click **Edit** in the top right corner. +5. Select **Databricks** for the warehouse +6. Select **Databricks (dbt-databricks)** for the adapter and enter the: + 1. `hostname` + 2. `http_path` + 3. (optional) catalog name +7. Click **Save**. -After the above steps have been performed, all users will have to refresh their IDE before being able to start working again. It should complete in less than a minute. +Everyone in your organization who uses dbt Cloud must refresh the IDE before starting work again. It should refresh in less than a minute. +#### About your credentials +When you update the Databricks connection in dbt Cloud, your team will not lose their credentials. 
This makes migrating easier since it only requires you to delete the Databricks connection and re-add the cluster or endpoint information. +These credentials will not get lost when there's a successful connection to Databricks using the `dbt-spark` ODBC method: +- The credentials you supplied to dbt Cloud to connect to your Databricks workspace. +- The personal access tokens your team added in their dbt Cloud profile so they can develop in the IDE for a given project. +- The access token you added for each deployment environment so dbt Cloud can connect to Databricks during production jobs. -### dbt Core + -In dbt Core, migrating to the dbt-databricks adapter from dbt-spark requires that you: -1. install the new adapter in your environment, and -2. modify your target in your `~/.dbt/profiles.yml` + -These changes will be needed for all users of your project. +To migrate your dbt Core projects to the `dbt-databricks` adapter from `dbt-spark`, you: +1. Install the [dbt-databricks adapter](https://github.com/databricks/dbt-databricks) in your environment +1. Update your Databricks connection by modifying your `target` in your `~/.dbt/profiles.yml` file -#### Example +Anyone who's using your project must also make these changes in their environment. -If you're using `dbt-spark` today to connect to a Databricks SQL Endpoint, the below examples show a good before and after of how to authenticate. The cluster example is also effectively the same. + + + + + +### Examples + +You can use the following examples of the `profiles.yml` file to see the authentication setup with `dbt-spark` compared to the simpler setup with `dbt-databricks` when connecting to an SQL endpoint. A cluster example would look similar. 
+ + +An example of what authentication looks like with `dbt-spark`: @@ -89,11 +107,13 @@ your_profile_name: host: dbc-l33t-nwb.cloud.databricks.com endpoint: 8657cad335ae63e3 token: [my_secret_token] - + ``` +An example of how much simpler authentication is with `dbt-databricks`: + ```yaml @@ -108,4 +128,4 @@ your_profile_name: token: [my_secret_token] ``` - \ No newline at end of file + diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md index 6a5ba57f1a8..d8f31a0f14a 100644 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md +++ b/website/docs/guides/migration/tools/migrating-from-stored-procedures/2-mapping-inserts.md @@ -13,7 +13,7 @@ INSERT INTO returned_orders (order_id, order_date, total_return) SELECT order_id, order_date, total FROM orders WHERE type = 'return' ``` -Converting this with a first pass to a [dbt model](/docs/get-started/getting-started/building-your-first-project/build-your-first-models) (in a file called returned_orders.sql) might look something like: +Converting this with a first pass to a [dbt model](/quickstarts/bigquery?step=8) (in a file called returned_orders.sql) might look something like: ```sql SELECT diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md index f7a6542acc7..1a8c6435d42 100644 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md +++ b/website/docs/guides/migration/tools/migrating-from-stored-procedures/4-mapping-deletes.md @@ -42,4 +42,4 @@ SELECT * FROM soft_deletes WHERE to_delete = false This approach flags all of the deleted records, and the final `SELECT` filters out any deleted data, so the resulting table contains only the remaining records. 
It’s a lot more verbose than just inverting the `DELETE` logic, but for complex `DELETE` logic, this ends up being a very effective way of performing the `DELETE` that retains historical context. -It’s worth calling out that while this doesn’t enable a hard delete, hard deletes can be executed a number of ways, the most common being to execute a dbt [macros](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros/#macros) via as a [run-operation](https://docs.getdbt.com/reference/commands/run-operation), or by using a [post-hook](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook/) to perform a `DELETE` statement after the records to-be-deleted have been marked. These are advanced approaches outside the scope of this guide. +It’s worth calling out that while this doesn’t enable a hard delete, hard deletes can be executed a number of ways, the most common being to execute a dbt [macro](/docs/build/jinja-macros) via a [run-operation](https://docs.getdbt.com/reference/commands/run-operation), or by using a [post-hook](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook/) to perform a `DELETE` statement after the records to-be-deleted have been marked. These are advanced approaches outside the scope of this guide. diff --git a/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md b/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md index 3a2c932c404..d059ab9a258 100644 --- a/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md +++ b/website/docs/guides/migration/tools/migrating-from-stored-procedures/5-mapping-merges.md @@ -9,7 +9,7 @@ dbt has a concept called [materialization](/docs/build/materializations), which Before we get into the exact details of how to implement an incremental materialization, let’s talk about logic conversion.
Extracting the logic of the `MERGE` and handling it as you would an `INSERT` or an `UPDATE` is the easiest way to get started migrating a `MERGE` command. . -To see how the logic conversion works, we’ll start with an example `MERGE`. In this scenario, imagine a ride sharing app where rides are loaded into an details table daily, and tips may be updated at some later date, and need to be kept up-to-date: +To see how the logic conversion works, we’ll start with an example `MERGE`. In this scenario, imagine a ride sharing app where rides are loaded into a details table daily, and tips may be updated at some later date, and need to be kept up-to-date: ```sql MERGE INTO ride_details USING ( @@ -69,7 +69,7 @@ inserts AS ( SELECT ride_id, subtotal, - NVL(rtl.tip, 0, rtl.tip) + NVL(tip, 0, tip) FROM using_clause diff --git a/website/docs/docs/get-started/learning-more/refactoring-legacy-sql.md b/website/docs/guides/migration/tools/refactoring-legacy-sql.md similarity index 93% rename from website/docs/docs/get-started/learning-more/refactoring-legacy-sql.md rename to website/docs/guides/migration/tools/refactoring-legacy-sql.md index 210eb10a58b..d9acfea6dab 100644 --- a/website/docs/docs/get-started/learning-more/refactoring-legacy-sql.md +++ b/website/docs/guides/migration/tools/refactoring-legacy-sql.md @@ -10,13 +10,13 @@ But in reality, you probably already have some queries or stored procedures that There are two parts to accomplish this: migration and refactoring. In this guide we’re going to learn a process to help us turn legacy SQL code into modular dbt models. -When migrating and refactoring code, it’s of course important to stay organized. We'll do this is by following several steps (jump directly from the right sidebar): +When migrating and refactoring code, it’s of course important to stay organized. We'll do this by following several steps (jump directly from the right sidebar): 1. Migrate your code 1:1 into dbt 2. 
Implement dbt sources rather than referencing raw database tables 3. Choose a refactoring strategy 4. Implement CTE groupings and cosmetic cleanup -5. Separate data transformations into standardized layers +5. Separate [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) into standardized layers 6. Audit the output of dbt models vs legacy SQL Let's get into it! @@ -38,7 +38,7 @@ To get going, you'll copy your legacy SQL query into your dbt project, by saving Once you've copied it over, you'll want to `dbt run` to execute the query and populate the in your warehouse. -If this is your first time running dbt, you may want to start with the [Introduction to dbt](/docs/introduction) and the earlier sections of the [Getting Started guide](/docs/get-started/getting-started/overview) before diving into refactoring. +If this is your first time running dbt, you may want to start with the [Introduction to dbt](/docs/introduction) and the earlier sections of the [quickstart guide](/quickstarts) before diving into refactoring. This step may sound simple, but if you're porting over an existing set of SQL transformations to a new SQL dialect, you will need to consider how your legacy SQL dialect differs from your new SQL flavor, and you may need to modify your legacy code to get it to run at all. @@ -59,7 +59,7 @@ This allows you to call the same table in multiple places with `{{ src('my_sourc We start here for several reasons: #### Source freshness reporting -Using sources unlocks the ability to run [source freshness reporting](docs/build/sources#snapshotting-source-data-freshness) to make sure your raw data isn't stale. +Using sources unlocks the ability to run [source freshness reporting](/docs/build/sources#snapshotting-source-data-freshness) to make sure your raw data isn't stale. #### Easy dependency tracing If you're migrating multiple stored procedures into dbt, with sources you can see which queries depend on the same raw tables. 
@@ -81,7 +81,7 @@ sources: With a few lines of code in a `.yml` file in your dbt project's `/models` subfolder, you can now version control how your data sources (Snowplow, Shopify, etc) map to actual database tables. -For example, let's say you migrate from one [ETL tool](https://getdbt.com/analytics-engineering/etl-tools-a-love-letter/) to another, and the new tool writes to a new schema in your warehouse. dbt sources allow you to make that update in a single config file, and flip on the change with one pull request to your dbt project. +For example, let's say you migrate from one ETL tool to another, and the new tool writes to a new schema in your warehouse. dbt sources allow you to make that update in a single config file, and flip on the change with one pull request to your dbt project. ## Choose a refactoring strategy There are two ways you can choose to refactor: in-place or alongside. @@ -118,7 +118,7 @@ Means that you will copy your model to a `/marts` folder, and work on changes on We generally recommend the **alongside** approach, which we'll follow in this tutorial. ## Implement CTE groupings -Once you choose your refactoring strategy, you'll want to do some cosmetic cleanups according to your data modeling best practices and start moving code into CTE groupings. This will give you a head start on porting SQL snippets from CTEs into modular [dbt data models](https://docs.getdbt.com/docs/building-a-dbt-project/building-models). +Once you choose your refactoring strategy, you'll want to do some cosmetic cleanups according to your data modeling best practices and start moving code into CTE groupings. This will give you a head start on porting SQL snippets from CTEs into modular [dbt data models](https://docs.getdbt.com/docs/build/models). 
@@ -204,9 +204,9 @@ This allows anyone after us to easily step through the CTEs when troubleshooting > For more background on CTEs, check out the [dbt Labs style guide](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md#ctes). ## Port CTEs to individual data models -Rather than keep our SQL code confined to one long SQL file, we'll now start splitting it into modular + reusable [dbt data models](https://docs.getdbt.com/docs/building-a-dbt-project/building-models). +Rather than keep our SQL code confined to one long SQL file, we'll now start splitting it into modular + reusable [dbt data models](https://docs.getdbt.com/docs/build/models). -Internally at dbt Labs, we follow roughly this [data modeling technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) and we [structure our dbt projects](https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355) accordingly. +Internally at dbt Labs, we follow roughly this [data modeling technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) and we [structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) accordingly. We'll follow those structures in this walkthrough, but your team's conventions may of course differ from ours. 
diff --git a/website/docs/guides/migration/versions/00-upgrading-to-v1.7.md b/website/docs/guides/migration/versions/00-upgrading-to-v1.7.md new file mode 100644 index 00000000000..f350e8955f7 --- /dev/null +++ b/website/docs/guides/migration/versions/00-upgrading-to-v1.7.md @@ -0,0 +1,64 @@ +--- +title: "Upgrading to v1.7 (beta)" +description: New features and changes in dbt Core v1.7 +--- + +## Resources + +- [Changelog](https://github.com/dbt-labs/dbt-core/blob/8aaed0e29f9560bc53d9d3e88325a9597318e375/CHANGELOG.md) +- [CLI Installation guide](/docs/core/installation) +- [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) +- [Release schedule](https://github.com/dbt-labs/dbt-core/issues/8260) + +## What to know before upgrading + +dbt Labs is committed to providing backward compatibility for all versions 1.x, with the exception of any changes explicitly mentioned below. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). + +### Behavior changes + +dbt Core v1.7 expands the number of sources you can configure freshness for. Previously, freshness was limited to sources with a `loaded_at_field`; now, freshness can be generated from warehouse metadata tables when available. + +As part of this change, the `loaded_at_field` is no longer required to generate source freshness. If a source has a `freshness:` block, dbt will attempt to calculate freshness for that source: +- If a `loaded_at_field` is provided, dbt will calculate freshness via a select query (previous behavior). +- If a `loaded_at_field` is _not_ provided, dbt will calculate freshness via warehouse metadata tables when possible (new behavior). + +This is a relatively small behavior change, but worth calling out in case you notice that dbt is calculating freshness for _more_ sources than before. To exclude a source from freshness calculations, you have two options: +- Don't add a `freshness:` block.
+- Explicitly set `freshness: null` + +## New and changed features and functionality + +- [`dbt docs generate`](/reference/commands/cmd-docs) now supports `--select` to generate [catalog metadata](/reference/artifacts/catalog-json) for a subset of your project. Currently available for Snowflake and Postgres only, but other adapters are coming soon. +- [Source freshness](/docs/deploy/source-freshness) can now be generated from warehouse metadata tables, currently Snowflake only, but other adapters that have metadata tables are coming soon. + +### MetricFlow enhancements + +- Automatically create metrics on measures with [`create_metric: true`](/docs/build/semantic-models). +- Optional [`label`](/docs/build/semantic-models) in semantic_models, measures, dimensions and entities. +- New configurations for semantic models - [enable/disable](/reference/resource-configs/enabled), [group](/reference/resource-configs/group), and [meta](/reference/resource-configs/meta). +- Support `fill_nulls_with` and `join_to_timespine` for metric nodes. +- `saved_queries` extends governance beyond the semantic objects to their consumption. + +### For consumers of dbt artifacts (metadata) + +- The [manifest](/reference/artifacts/manifest-json) schema version has been updated to v11. +- The [run_results](/reference/artifacts/run-results-json) schema version has been updated to v5. +- There are a few specific changes to the [catalog.json](/reference/artifacts/catalog-json): + - Added [node attributes](/reference/artifacts/run-results-json) related to compilation (`compiled`, `compiled_code`, `relation_name`) to the `catalog.json`. + - The nodes dictionary in the `catalog.json` can now be "partial" if `dbt docs generate` is run with a selector. + +### Model governance + +dbt Core v1.5 introduced model governance which we're continuing to refine. 
v1.7 includes these additional features and functionality: + +- **[Breaking change detection](/reference/resource-properties/versions#detecting-breaking-changes) for models with contracts enforced:** When dbt detects a breaking change to a model with an enforced contract during state comparison, it will now raise an error for versioned models and a warning for models that are not versioned. +- **[Set `access` as a config](/reference/resource-configs/access):** You can now set a model's `access` within config blocks in the model's file or in the `dbt_project.yml` for an entire subfolder at once. +- **[Type aliasing for model contracts](/reference/resource-configs/contract):** dbt will use each adapter's built-in type aliasing for user-provided data types—meaning you can now write `string` always, and dbt will translate to `text` on Postgres/Redshift. This is "on" by default, but you can opt out. +- **[Raise warning for numeric types](/reference/resource-configs/contract):** Putting `numeric` in model contracts without considering that default values such as `numeric(38,0)` might round decimals can cause issues, so dbt will now warn you if it finds a numeric type without specified precision/scale. + +### Quick hits + +With these quick hits, you can now: +- Configure a [`delimiter`](/reference/resource-configs/delimiter) for a seed file. +- Use packages with the same git repo and unique subdirectory. +- Access the `date_spine` macro directly from dbt-core (moved over from dbt-utils). diff --git a/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md b/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md new file mode 100644 index 00000000000..50b0ca8bc58 --- /dev/null +++ b/website/docs/guides/migration/versions/01-upgrading-to-v1.6.md @@ -0,0 +1,94 @@ +--- +title: "Upgrading to v1.6 (latest)" +description: New features and changes in dbt Core v1.6 +--- + +dbt Core v1.6 has three significant areas of focus: +1.
Next milestone of [multi-project deployments](https://github.com/dbt-labs/dbt-core/discussions/6725): improvements to contracts, groups/access, versions; and building blocks for cross-project `ref` +1. Semantic layer re-launch: dbt Core and [MetricFlow](https://docs.getdbt.com/docs/build/about-metricflow) integration +1. Mechanisms to support mature deployment at scale (`dbt clone` and `dbt retry`) + +## Resources + +- [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.6.latest/CHANGELOG.md) +- [CLI Installation guide](/docs/core/installation) +- [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) +- [Release schedule](https://github.com/dbt-labs/dbt-core/issues/7481) + +## What to know before upgrading + +dbt Labs is committed to providing backward compatibility for all versions 1.x, with the exception of any changes explicitly mentioned below. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). + +### Behavior changes + +:::info Action required if your project defines `metrics` + +The [spec for metrics](https://github.com/dbt-labs/dbt-core/discussions/7456) has changed and now uses [MetricFlow](/docs/build/about-metricflow). + +::: + +If your dbt project defines metrics, you must migrate to dbt v1.6 because the YAML spec has moved from dbt_metrics to MetricFlow. Any tests you have won't compile on v1.5 or older. + +- dbt Core v1.6 does not support Python 3.7, which reached End Of Life on June 23. Supported Python versions are 3.8, 3.9, 3.10, and 3.11. +- As part of the [dbt Semantic layer](/docs/use-dbt-semantic-layer/dbt-sl) re-launch (in beta), the spec for `metrics` has changed significantly. Refer to the [migration guide](/guides/migration/sl-migration) for more info on how to migrate to the re-launched dbt Semantic Layer. +- The manifest schema version is now v10. +- dbt Labs is ending support for Homebrew installation of dbt-core and adapters.
See [the discussion](https://github.com/dbt-labs/dbt-core/discussions/8277) for more details. + +### For consumers of dbt artifacts (metadata) + +The [manifest](/reference/artifacts/manifest-json) schema version has been updated to `v10`. Specific changes: +- Addition of `semantic_models` and changes to `metrics` attributes +- Addition of `deprecation_date` as a model property +- Addition of `on_configuration_change` as default node configuration (to support materialized views) +- Small type changes to `contracts` and `constraints` +- Manifest `metadata` includes `project_name` + +### For maintainers of adapter plugins + +For more detailed information and to ask questions, please read and comment on the GH discussion: [dbt-labs/dbt-core#7958](https://github.com/dbt-labs/dbt-core/discussions/7958). + +## New and changed documentation + +### MetricFlow + +- [**Build your metrics**](/docs/build/build-metrics-intro) with MetricFlow, a key component of the dbt Semantic Layer. You can define your metrics and build semantic models with MetricFlow, available on the command line (CLI) for dbt Core v1.6 beta or higher. + +### Materialized views + +Supported on: +- [Postgres](/reference/resource-configs/postgres-configs#materialized-view) +- [Redshift](/reference/resource-configs/redshift-configs#materialized-view) +- [Snowflake](/reference/resource-configs/snowflake-configs#dynamic-tables) +- [Databricks](/reference/resource-configs/databricks-configs#materialized-views-and-streaming-tables) + +Support for BigQuery coming soon. + +### New commands for mature deployment + +[`dbt retry`](/reference/commands/retry) executes the previously run command from the point of failure. Rebuild just the nodes that errored or skipped in a previous run/build/test, rather than starting over from scratch. + +[`dbt clone`](/reference/commands/clone) leverages each data platform's functionality for creating lightweight copies of dbt models from one environment into another. 
Useful when quickly spinning up a new development environment, or promoting specific models from a staging environment into production. + +### Multi-project collaboration + +[**Deprecation date**](/reference/resource-properties/deprecation_date): Models can declare a deprecation date that will warn model producers and downstream consumers. This enables clear migration windows for versioned models, and provides a mechanism to facilitate removal of immature or little-used models, helping to avoid project bloat. + +[Model names](/faqs/Models/unique-model-names) can be duplicated across different namespaces (projects/packages), so long as they are unique within each project/package. We strongly encourage using [two-argument `ref`](/reference/dbt-jinja-functions/ref#two-argument-variant) when referencing a model from a different package/project. + +More consistency and flexibility around packages. Resources defined in a package will respect variable and global macro definitions within the scope of that package. +- `vars` defined in a package's `dbt_project.yml` are now available in the resolution order when compiling nodes in that package, though CLI `--vars` and the root project's `vars` will still take precedence. See ["Variable Precedence"](/docs/build/project-variables#variable-precedence) for details. +- `generate_x_name` macros (defining custom rules for database, schema, alias naming) follow the same pattern as other "global" macros for package-scoped overrides. See [macro dispatch](/reference/dbt-jinja-functions/dispatch) for an overview of the patterns that are possible. + +:::caution Closed Beta - dbt Cloud Enterprise +[**Project dependencies**](/docs/collaborate/govern/project-dependencies): Introduces `dependencies.yml` and dependent `projects` as a feature of dbt Cloud Enterprise. Allows enforcing model access (public vs. protected/private) across project/package boundaries. 
Enables cross-project `ref` of public models, without requiring the installation of upstream source code. +::: + +### Quick hits + +- [`state:unmodified` and `state:old`](/reference/node-selection/methods#the-state-method) for [MECE](https://en.wikipedia.org/wiki/MECE_principle) stateful selection +- [`invocation_args_dict`](/reference/dbt-jinja-functions/flags#invocation_args_dict) includes full `invocation_command` as string +- [`dbt debug --connection`](/reference/commands/debug) to test just the data platform connection specified in a profile +- [`dbt docs generate --empty-catalog`](/reference/commands/cmd-docs) to skip catalog population while generating docs +- [`--defer-state`](/reference/node-selection/defer) enables more-granular control +- [`dbt ls`](/reference/commands/list) adds the Semantic model selection method to allow for `dbt ls -s "semantic_model:*"` and the ability to execute `dbt ls --resource-type semantic_model`. + diff --git a/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md b/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md new file mode 100644 index 00000000000..0c7fc7ebcad --- /dev/null +++ b/website/docs/guides/migration/versions/02-upgrading-to-v1.5.md @@ -0,0 +1,151 @@ +--- +title: "Upgrading to v1.5" +description: New features and changes in dbt Core v1.5 +--- + +dbt Core v1.5 is a feature release, with two significant additions: +1. [**Model governance**](/docs/collaborate/govern/about-model-governance) — access, contracts, versions — the first phase of [multi-project deployments](https://github.com/dbt-labs/dbt-core/discussions/6725) +2. 
A Python entry point for [**programmatic invocations**](/reference/programmatic-invocations), at parity with the CLI + +## Resources + +- [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.5.latest/CHANGELOG.md) +- [CLI Installation guide](/docs/core/installation) +- [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) +- [Release schedule](https://github.com/dbt-labs/dbt-core/issues/6715) + +## What to know before upgrading + +dbt Labs is committed to providing backward compatibility for all versions 1.x, with the exception of any changes explicitly mentioned below. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). + +### Behavior changes + +:::info Why changes to previous behavior? + +This release includes significant new features, and rework to `dbt-core`'s CLI and initialization flow. As part of refactoring its internals, we made a handful of changes to runtime configuration. The net result of these changes is more consistent & practical configuration options, and a more legible codebase. + +**_Wherever possible, we will provide backward compatibility and deprecation warnings for at least one minor version before actually removing the old functionality._** In those cases, we still reserve the right to fully remove backwards compatibility for deprecated functionality in a future v1.x minor version of `dbt-core`. + +::: + +Setting `log-path` and `target-path` in `dbt_project.yml` has been deprecated for consistency with other invocation-specific runtime configs ([dbt-core#6882](https://github.com/dbt-labs/dbt-core/issues/6882)). We recommend setting via env var or CLI flag instead. + +The `dbt list` command will now include `INFO` level logs by default. Previously, the `list` command (and _only_ the `list` command) had `WARN`-level stdout logging, to support piping its results to [`jq`](https://stedolan.github.io/jq/manual/), a file, or another process. 
To achieve that goal, you can use either of the following parameters: +- `dbt --log-level warn list` (recommended; equivalent to previous default) +- `dbt --quiet list` (suppresses all logging less than ERROR level, except for "printed" messages and `list` output) + +The following env vars have been renamed, for consistency with the convention followed by all other parameters: +- `DBT_DEFER_TO_STATE` → `DBT_DEFER` +- `DBT_FAVOR_STATE_MODE` → `DBT_FAVOR_STATE` +- `DBT_NO_PRINT` → `DBT_PRINT` +- `DBT_ARTIFACT_STATE_PATH` → `DBT_STATE` + +As described in [dbt-core#7169](https://github.com/dbt-labs/dbt-core/pull/7169), command-line parameters that could be silent before will no longer be silent. See [dbt-labs/dbt-core#7158](https://github.com/dbt-labs/dbt-core/issues/7158) and [dbt-labs/dbt-core#6800](https://github.com/dbt-labs/dbt-core/issues/6800) for more examples of the behavior we are fixing. + +An empty `tests:` key in a yaml file will now raise a validation error, instead of being silently skipped. You can resolve this by removing the empty `tests:` key, or by setting it to an empty list explicitly: +```yml +# ❌ this will raise an error +models: + - name: my_model + tests: + config: ... + +# ✅ this is fine +models: + - name: my_model + tests: [] # todo! add tests later + config: ... +``` + +Some options that could previously be specified _after_ a subcommand can now only be specified _before_. This includes the inverse of the option, `--write-json` and `--no-write-json`, for example. The list of affected options are: + +
          +List of affected options + +```bash +--cache-selected-only | --no-cache-selected-only +--debug, -d | --no-debug +--deprecated-print | --deprecated-no-print +--enable-legacy-logger | --no-enable-legacy-logger +--fail-fast, -x | --no-fail-fast +--log-cache-events | --no-log-cache-events +--log-format +--log-format-file +--log-level +--log-level-file +--log-path +--macro-debugging | --no-macro-debugging +--partial-parse | --no-partial-parse +--partial-parse-file-path +--populate-cache | --no-populate-cache +--print | --no-print +--printer-width +--quiet, -q | --no-quiet +--record-timing-info, -r +--send-anonymous-usage-stats | --no-send-anonymous-usage-stats +--single-threaded | --no-single-threaded +--static-parser | --no-static-parser +--use-colors | --no-use-colors +--use-colors-file | --no-use-colors-file +--use-experimental-parser | --no-use-experimental-parser +--version, -V, -v +--version-check | --no-version-check +--warn-error +--warn-error-options +--write-json | --no-write-json + +``` + +
          + + +Additionally, some options that could be previously specified _before_ a subcommand can now only be specified _after_. Any option _not_ in the above list must appear _after_ the subcommand from v1.5 and later. For example, `--profiles-dir`. + + +The built-in [collect_freshness](https://github.com/dbt-labs/dbt-core/blob/1.5.latest/core/dbt/include/global_project/macros/adapters/freshness.sql) macro now returns the entire `response` object, instead of just the `table` result. If you're using a custom override for `collect_freshness`, make sure you're also returning the `response` object; otherwise, some of your dbt commands will never finish. For example: + +```sql +{{ return(load_result('collect_freshness')) }} +``` + +Finally: The [built-in `generate_alias_name` macro](https://github.com/dbt-labs/dbt-core/blob/1.5.latest/core/dbt/include/global_project/macros/get_custom_name/get_custom_alias.sql) now includes logic to handle versioned models. If your project has reimplemented the `generate_alias_name` macro with custom logic, and you want to start using [model versions](/docs/collaborate/govern/model-versions), you will need to update the logic in your macro. Note that, while this is **not** a prerequisite for upgrading to v1.5—only for using the new feature—we recommend that you do this during your upgrade, whether you're planning to use model versions tomorrow or far in the future. + +### For consumers of dbt artifacts (metadata) + +The [manifest](/reference/artifacts/manifest-json) schema version will be updated to `v9`.
Specific changes: +- Addition of `groups` as a top-level key +- Addition of `access`, `constraints`, `version`, `latest_version` as top-level node attributes for models +- Addition of `constraints` as a column-level attribute +- Addition of `group` and `contract` as node configs +- To support model versions, the type of `refs` has changed from `List[List[str]]` to `List[RefArgs]`, with nested keys `name: str`, `package: Optional[str] = None`, and `version: Union[str, float, NoneType] = None`. + +### For maintainers of adapter plugins + +For more detailed information and to ask questions, please read and comment on the GH discussion: [dbt-labs/dbt-core#7213](https://github.com/dbt-labs/dbt-core/discussions/7213). + +## New and changed documentation + +### Model governance + +The first phase of supporting dbt deployments at scale, across multiple projects with clearly defined ownership and interface boundaries. [Read about model governance](/docs/collaborate/govern/about-model-governance), all of which is new in v1.5. + +### Revamped CLI + +Compile and preview dbt models and `--inline` dbt-SQL queries on the CLI using: +- [`dbt compile`](/reference/commands/compile) +- [`dbt show`](/reference/commands/show) (new!) + +[Node selection methods](/reference/node-selection/methods) can use Unix-style wildcards to glob nodes matching a pattern: +``` +dbt ls --select "tag:team_*" +``` + +And (!): a first-ever entry point for [programmatic invocations](/reference/programmatic-invocations), at parity with CLI commands. + +Run `dbt --help` to see new & improved help documentation :) + +### Quick hits +- The [`version: 2` top-level key](/reference/project-configs/version) is now **optional** in all YAML files. Also, the [`config-version: 2`](/reference/project-configs/config-version) and `version:` top-level keys are now optional in `dbt_project.yml` files.
+- [Events and logging](/reference/events-logging): Added `node_relation` (`database`, `schema`, `identifier`) to the `node_info` dictionary, available on node-specific events +- Support setting `--project-dir` via environment variable: [`DBT_PROJECT_DIR`](/reference/dbt_project.yml) +- More granular [configurations](/reference/global-configs/about-global-configs) for logging (to set log format, log levels, and colorization) and cache population diff --git a/website/docs/guides/migration/versions/03-upgrading-to-dbt-utils-v1.0.md b/website/docs/guides/migration/versions/03-upgrading-to-dbt-utils-v1.0.md new file mode 100644 index 00000000000..72c6fc3c968 --- /dev/null +++ b/website/docs/guides/migration/versions/03-upgrading-to-dbt-utils-v1.0.md @@ -0,0 +1,153 @@ +--- +title: "Upgrading to dbt utils v1.0" +description: New features and breaking changes to consider as you upgrade to dbt utils v1.0. +--- + +# Upgrading to dbt utils v1.0 + +For the first time, [dbt utils](https://hub.getdbt.com/dbt-labs/dbt_utils/latest/) is crossing the major version boundary. From [last month’s blog post](https://www.getdbt.com/blog/announcing-dbt-v1.3-and-utils/): + +> It’s time to formalize what was already unofficial policy: you can rely on dbt utils in the same way as you do dbt Core, with stable interfaces and consistent and intuitive naming. + +Just like the switch to dbt Core 1.0 last year, there are some breaking changes as we standardized and prepared for the future. Most changes can be handled with find-and-replace. If you need help, post on the [Community Forum](https://discourse.getdbt.com) or in [#package-ecosystem](https://getdbt.slack.com/archives/CU4MRJ7QB) channel on Slack. + +## New features + +- `get_single_value()` — An easy way to pull a single value from a SQL query, instead of having to access the `[0][0]`th element of a `run_query` result. +- `safe_divide()` — Returns null when the denominator is 0, instead of throwing a divide-by-zero error. 
+- New `not_empty_string` test — An easier wrapper than using `expression_is_true` to check the length of a column. + +## Enhancements + +- Many tests are more meaningful when you run them against subgroups of a table. For example, you may need to validate that recent data exists for every turnstile instead of a single data source being sufficient. Add the new `group_by_columns` argument to your tests to do so. Review [this article](https://www.emilyriederer.com/post/grouping-data-quality/) by the test's author for more information. +- With the addition of an on-by-default `quote_identifiers` argument in the `star()` macro, you can now disable quoting if necessary. +- The `recency` test now has an optional `ignore_time_component` argument which can be used when testing against a date column. This prevents the time of day the test runs from causing false negatives/positives. + +## Fixes + +- `union()` now includes/excludes columns case-insensitively +- `slugify()` prefixes an underscore when the first char is a digit +- The `expression_is_true` test doesn’t output `*` unless storing failures, a cost improvement for BigQuery. + +## Breaking Changes +### Changes to `surrogate_key()`: + +- `surrogate_key()` has been replaced by `generate_surrogate_key()`. The original treated null values and blank strings the same, which could lead to duplicate keys being created. `generate_surrogate_key()` does not have this flaw. Compare the [surrogate keys calculated for these columns](https://docs.google.com/spreadsheets/d/1qWfdbieUOSgkzdY0kmJ9iCgdqyWccA0R-6EW0EgaMQc/edit#gid=0): + +![A table comparing the behavior of surrogate_key and generate_surrogate_key](/img/guides/migration/versions/surrogate_key_behaviour.png) + +Changing the calculation method for surrogate keys, even for the better, could have significant consequences in downstream uses (such as snapshots and incremental models which use this column as their `unique_key`). 
As a result, it's possible to opt into the legacy behavior by setting the following variable in your dbt project: + +```yaml +#dbt_project.yml +vars: + surrogate_key_treat_nulls_as_empty_strings: true #turn on legacy behavior +``` + +By creating a new macro instead of updating the behavior of the old one, we are requiring all projects who use this macro to make an explicit decision about which approach is better for their context. + +**Our recommendation is that existing users should opt into the legacy behavior** unless you are confident that either: + +- your surrogate keys never contained nulls, or +- your surrogate keys are not used for incremental models, snapshots or other stateful artifacts and so can be regenerated with new values without issue. + +:::caution Warning to package maintainers + +You can not assume one behavior or the other, as each project can customize its behavior. + +::: + +### Functionality now native to dbt Core: +- The `expression_is_true` test no longer has a dedicated `condition` argument. Instead, use `where` which is [now available natively to all tests](https://docs.getdbt.com/reference/resource-configs/where): + +```yaml +version: 2 + +models: + - name: old_syntax + tests: + - dbt_utils.expression_is_true: + expression: "col_a + col_b = total" + #replace this... + condition: "created_at > '2018-12-31'" + + - name: new_syntax + tests: + - dbt_utils.expression_is_true: + expression: "col_a + col_b = total" + # ...with this... + where: "created_at > '2018-12-31'" +``` +**Note** — This may cause some tests to get the same autogenerated names. To resolve this, you can [define a custom name for a test](/reference/resource-properties/tests#define-a-custom-name-for-one-test). +- The deprecated `unique_where` and `not_null_where` tests have been removed, because [where is now available natively to all tests](https://docs.getdbt.com/reference/resource-configs/where). 
To migrate, find and replace `dbt_utils.unique_where` with `unique` and `dbt_utils.not_null_where` with `not_null`. +- `dbt_utils.current_timestamp()` is replaced by `dbt.current_timestamp()`. + - Note that Postgres and Snowflake’s implementation of `dbt.current_timestamp()` differs from the old `dbt_utils` one ([full details here](https://github.com/dbt-labs/dbt-utils/pull/597#issuecomment-1231074577)). If you use Postgres or Snowflake and need identical backwards-compatible behavior, use `dbt.current_timestamp_backcompat()`. This discrepancy will hopefully be reconciled in a future version of dbt Core. +- All other cross-db macros have moved to the dbt namespace, with no changes necessary other than replacing `dbt_utils.` with `dbt.`. Review the [cross database macros documentation](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros) for the full list. + - In your code editor, you can do a global find and replace with regex: `\{\{\s*dbt_utils\.(any_value|bool_or|cast_bool_to_text|concat|dateadd|datediff|date_trunc|escape_single_quotes|except|hash|intersect|last_day|length|listagg|position|replace|right|safe_cast|split_part|string_literal|type_bigint|type_float|type_int|type_numeric|type_string|type_timestamp|type_bigint|type_float|type_int|type_numeric|type_string|type_timestamp|except|intersect|concat|hash|length|position|replace|right|split_part|escape_single_quotes|string_literal|any_value|bool_or|listagg|cast_bool_to_text|safe_cast|dateadd|datediff|date_trunc|last_day)` → `{{ dbt.$1` +### Removal of `insert_by_period` materialization +- The `insert_by_period` materialization has been moved to the [experimental-features repo](https://github.com/dbt-labs/dbt-labs-experimental-features/tree/main/insert_by_period). 
To continue to use it, add the below to your packages.yml file: + +```yaml +packages: + - git: https://github.com/dbt-labs/dbt-labs-experimental-features + subdirectory: insert_by_period + revision: XXXX #optional but highly recommended. Provide a full git sha hash, e.g. 1c0bfacc49551b2e67d8579cf8ed459d68546e00. If not provided, uses the current HEAD. +``` +### Removal of deprecated legacy behavior: +- `safe_add()` only works with a list of arguments; use `{{ dbt_utils.safe_add(['column_1', 'column_2']) }}` instead of varargs `{{ dbt_utils.safe_add('column_1', 'column_2') }}`. +- Several long-promised deprecations to `deduplicate()` have been applied: + - The `group_by` argument is replaced by `partition_by`. + - `relation_alias` is removed. If you need an alias, you can pass it directly to the `relation` argument. + - `order_by` is now mandatory. Pass a static value like `1` if you don’t care how they are deduplicated. +- The deprecated `table` argument has been removed from `unpivot()`. Use `relation` instead. + + +## Resolving error messages +After upgrading, these are common error messages you may encounter, along with their resolutions. +
          + dict object has no attribute MACRO_NAME +
          +

          Cause: No macro called MACRO_NAME exists. This is most likely because the macro has moved to the dbt namespace (see above). It could also be because you haven't run dbt deps or have misspelled a macro's name.

          +

          Resolution: For cross-database macros, change dbt_utils.MACRO_NAME() to dbt.MACRO_NAME().

          +
          +
          +
          + macro 'dbt_macro__generate_surrogate_key' takes not more than 1 argument(s) +
          +

          Cause: generate_surrogate_key() requires a single argument containing a list of columns, not a set of varargs.

          +

          Resolution: Change to dbt_utils.generate_surrogate_key(['column_1', 'column_2']) - note the square brackets.

          +
          +
          +
          + The dbt_utils.surrogate_key has been replaced by dbt_utils.generate_surrogate_key +
          +

          Cause: surrogate_key() has been replaced.

          +

          Resolution: +

            +
          1. Decide whether you need to enable backwards compatibility as detailed above.
          +
          2. Find and replace dbt_utils.surrogate_key with dbt_utils.generate_surrogate_key.
          +
          +

          +
          +
          +
          + macro dbt_macro__test_expression_is_true takes no keyword argument condition +
          +

          Cause: condition has been removed from the expression_is_true test, now that where is available on all tests automatically.

          +

          Resolution: Replace condition with where.

          +
          +
          +
          + No materialization insert_by_period was found for adapter +
          +

          Cause: insert_by_period has moved to the experimental features repo (see above).

          +

          Resolution: Install the package as described above.

          +
          +
          +
          + dbt found two tests with the name "XXX". +
          +

          Cause: Changing from condition to where in the expression_is_true test can leave several tests with the same autogenerated name, because configs (such as where) are not part of a test's unique name.

          +

          Resolution: Define a custom name for your test.

          +
          +
          diff --git a/website/docs/guides/migration/versions/04-upgrading-to-v1.4.md b/website/docs/guides/migration/versions/04-upgrading-to-v1.4.md new file mode 100644 index 00000000000..3537eb1677a --- /dev/null +++ b/website/docs/guides/migration/versions/04-upgrading-to-v1.4.md @@ -0,0 +1,59 @@ +--- +title: "Upgrading to v1.4" +description: New features and changes in dbt Core v1.4 +--- +### Resources + +- [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.4.latest/CHANGELOG.md) +- [CLI Installation guide](/docs/core/installation) +- [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) + +**Final release:** January 25, 2023 + +dbt Core v1.4 is a "behind-the-scenes" release. We've been hard at work rebuilding `dbt-core` internals on top of more-solid foundations, to enable an exciting year of new feature development. Check out the [v1.5 milestone](https://github.com/dbt-labs/dbt-core/milestone/82) in GitHub for a preview of what's planned for April. + +## What to know before upgrading + +dbt Labs is committed to providing backward compatibility for all versions 1.x. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). + +### For consumers of dbt artifacts (metadata) + +The manifest schema version has updated to `v8`. These changes are relevant for people who parse or analyze the contents of the `manifest.json` file, or who have custom code accessing the [`model`](https://docs.getdbt.com/reference/dbt-jinja-functions/model) or [`graph`](https://docs.getdbt.com/reference/dbt-jinja-functions/graph) variables, e.g. `{{ model.root_path }}`. + +Relevant changes: +- The `root_path` attribute has been removed for non-seed nodes to reduce duplicative information. +- Unused attributes have been removed from seed nodes (including `depends_on.nodes`), and from `macros` (including `tags`). 
+- The `unique_id` of docs blocks now starts with `doc` for consistency with other resource types. + +### For maintainers of adapter plugins + +> **TL;DR** Not much heavy lifting for this minor version. We anticipate more work for `1.5.0`. We plan to release betas early & often, and provide guidance on upgrading. + +The high-level changes are: +- Add support for Python 3.11 +- Rename/replace deprecated exception functions +- Add support for Incremental Predicates (if applicable) +- Make use of new adapter-zone tests + +For more detailed information and to ask any questions, please visit [dbt-core/discussions/6624](https://github.com/dbt-labs/dbt-core/discussions/6624). + +## New and changed documentation + +- [**Events and structured logging**](/reference/events-logging): dbt's event system got a makeover. Expect more consistency in the availability and structure of information, backed by type-safe event schemas. +- [**Python support**](/faqs/Core/install-python-compatibility): Python 3.11 was released in October 2022. It is officially supported in dbt-core v1.4, although full support depends also on the adapter plugin for your data platform. According to the Python maintainers, "Python 3.11 is between 10-60% faster than Python 3.10." We encourage you to try [`dbt parse`](/reference/commands/parse) with dbt Core v1.4 + Python 3.11, and compare the timing with dbt Core v1.3 + Python 3.10. Let us know what you find! +- [**Metrics**](/docs/build/metrics): `time_grain` is optional, to provide better ergonomics around metrics that aren't time-bound. +- **dbt-Jinja context:** The [local_md5](/reference/dbt-jinja-functions/local_md5) context method will calculate an [MD5 hash](https://en.wikipedia.org/wiki/MD5) for use _within_ dbt. (Not to be confused with SQL md5!) +- [**Exposures**](/docs/build/exposures) can now depend on `metrics`.
+- [**"Tarball" packages**](/docs/build/packages#internally-hosted-tarball-URL): Some organizations have security requirements to pull resources only from internal services. To address the need to install packages from hosted environments (such as Artifactory or cloud storage buckets), it's possible to specify any accessible URL where a compressed dbt package can be downloaded. +- [**Granular "warn error" configuration**](/reference/global-configs/warnings): Thanks to a full cleanup and consolidation of warning and exception classes within `dbt-core`, it is now possible to define a more granular `--warn-error-options` configuration that specifies the exact warnings you do (or don't) want dbt to treat as errors. +- [**Deferral**](/reference/node-selection/defer#favor-state) supports an optional configuration, `--favor-state`. + +### Advanced configurations for incremental models + +- [**`incremental_predicates`** config](/docs/build/incremental-models#about-incremental_predicates) is now supported on the most popular adapters, enabling greater flexibility when tuning performance in `merge` and `delete` statements against large datasets. +- **BigQuery:** The `insert_overwrite` incremental strategy supports a new (old) mechanism, [`time_ingestion_partitioning`](/reference/resource-configs/bigquery-configs#partitioning-by-an-ingestion-date-or-timestamp) + [`copy_partitions`](#copying-ingestion-time-partitions), that can yield significant savings in cost + time for large datasets. + +### Updates to Python models + +- Python models are [configured to materialize](/docs/build/python-models) as `table` by default. +- Python models [running on Snowpark](/docs/build/python-models) will use "anonymous" stored procedures by default, enabling a small speedup and a cleaner query history. 
diff --git a/website/docs/guides/migration/versions/05-upgrading-to-v1.3.md b/website/docs/guides/migration/versions/05-upgrading-to-v1.3.md index 3d687da9f9d..5fdf559a267 100644 --- a/website/docs/guides/migration/versions/05-upgrading-to-v1.3.md +++ b/website/docs/guides/migration/versions/05-upgrading-to-v1.3.md @@ -1,20 +1,21 @@ --- -title: "Upgrading to v1.3 (latest)" +title: "Upgrading to v1.3" +description: New features and changes in dbt Core v1.3 --- ### Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.3.latest/CHANGELOG.md) -- [CLI Installation guide](/docs/get-started/installation) +- [CLI Installation guide](/docs/core/installation) - [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) -## Breaking changes +## What to know before upgrading We are committed to providing backward compatibility for all versions 1.x. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). There are three changes in dbt Core v1.3 that may require action from some users: -1. If you have a `profiles.yml` file located in the root directory where you run dbt, dbt will start preferring that profiles file over the default location on your machine. [You can read more details here](/docs/get-started/connection-profiles#advanced-customizing-a-profile-directory). -2. If you already have `.py` files defined in the `model-paths` of your dbt project, dbt will start trying to read them as Python models. You can use [the new `.dbtignore` file](dbtignore) to tell dbt to ignore those files. -3. If you have custom code accessing the `raw_sql` property of models (with the [model](dbt-jinja-functions/model) or [graph](/reference/dbt-jinja-functions/graph) objects), it has been renamed to `raw_code`. This is a change to the manifest contract, described in more detail below. +1. 
If you have a `profiles.yml` file located in the root directory where you run dbt, dbt will start preferring that profiles file over the default location on your machine. [You can read more details here](/docs/core/connect-data-platform/connection-profiles#advanced-customizing-a-profile-directory). +2. If you already have `.py` files defined in the `model-paths` of your dbt project, dbt will start trying to read them as Python models. You can use [the new `.dbtignore` file](/reference/dbtignore) to tell dbt to ignore those files. +3. If you have custom code accessing the `raw_sql` property of models (with the [model](/reference/dbt-jinja-functions/model) or [graph](/reference/dbt-jinja-functions/graph) objects), it has been renamed to `raw_code`. This is a change to the manifest contract, described in more detail below. ### For users of dbt Metrics @@ -32,26 +33,23 @@ We have updated the manifest schema version to `v7`. This includes the changes t - Renamed `compiled_sql` to `compiled_code` - A new top-level node property, `language` (`'sql'` or `'python'`) -For users of [state-based selection](understanding-state): This release includes logic providing backward and forward compatibility for older manifest versions. While running dbt Core v1.3, it should be possible to use `state:modified --state ...` selection against a manifest produced by dbt Core v1.0 and higher. +For users of [state-based selection](/reference/node-selection/syntax#about-node-selection): This release includes logic providing backward and forward compatibility for older manifest versions. While running dbt Core v1.3, it should be possible to use `state:modified --state ...` selection against a manifest produced by dbt Core v1.0 and higher. 
### For maintainers of adapter plugins -_GitHub discussion forthcoming_ - -**Notes:** -- The `statement` and `create_table_as` macros accept a new argument, `language`, with a default value of `'sql'` +GitHub discussion with details: [dbt-labs/dbt-core#6011](https://github.com/dbt-labs/dbt-core/discussions/6011) ## New and changed documentation -- **[Python models](building-models/python-models)** are natively supported in `dbt-core` for the first time, on data warehouses that support Python runtimes. -- Updates made to **[Metrics](build/metrics)** reflect their new syntax for definition, as well as additional properties that are now available. -- Plus, a few related updates to **[exposure properties](exposure-properties)**: `config`, `label`, and `name` validation. +- **[Python models](/docs/build/python-models)** are natively supported in `dbt-core` for the first time, on data warehouses that support Python runtimes. +- Updates made to **[Metrics](/docs/build/metrics)** reflect their new syntax for definition, as well as additional properties that are now available. +- Plus, a few related updates to **[exposure properties](/reference/exposure-properties)**: `config`, `label`, and `name` validation. - **[Custom `node_color`](/reference/resource-configs/docs.md)** in `dbt-docs`. For the first time, you can control the colors displayed in dbt's DAG. Want bronze, silver, and gold layers? It's at your fingertips. -- **[`Profiles.yml`](/docs/get-started/connection-profiles#advanced-customizing-a-profile-directory)** search order now looks in the current working directory before `~/.dbt`. +- **[`Profiles.yml`](/docs/core/connect-data-platform/connection-profiles#advanced-customizing-a-profile-directory)** search order now looks in the current working directory before `~/.dbt`. ### Quick hits -- **["Full refresh"](full_refresh)** flag supports a short name, `-f`. 
-- **[The "config" selection method](methods#the-config-method)** supports boolean and list config values, in addition to strings. -- Two new dbt-Jinja context variables for accessing invocation metadata: [`invocation_args_dict`](flags#invocation_args_dict) and [`dbt_metadata_envs`](env_var#custom-metadata). +- **["Full refresh"](/reference/resource-configs/full_refresh)** flag supports a short name, `-f`. +- **[The "config" selection method](/reference/node-selection/methods#the-config-method)** supports boolean and list config values, in addition to strings. +- Two new dbt-Jinja context variables for accessing invocation metadata: [`invocation_args_dict`](/reference/dbt-jinja-functions/flags#invocation_args_dict) and [`dbt_metadata_envs`](/reference/dbt-jinja-functions/env_var#custom-metadata). diff --git a/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md b/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md index 51f1f8d0a01..91ffadf9093 100644 --- a/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md +++ b/website/docs/guides/migration/versions/06-upgrading-to-v1.2.md @@ -1,13 +1,14 @@ --- title: "Upgrading to v1.2" +description: New features and changes in dbt Core v1.2 --- ### Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.2.latest/CHANGELOG.md) -- [CLI Installation guide](/docs/get-started/installation) +- [CLI Installation guide](/docs/core/installation) - [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) -## Breaking changes +## What to know before upgrading There are no breaking changes for code in dbt projects and packages. We are committed to providing backwards compatibility for all versions 1.x. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). @@ -17,7 +18,7 @@ The manifest schema version has been updated to `v6`. 
The relevant changes are: - Change to `config` default, which includes a new `grants` property with default value `{}` - Addition of a `metrics` property, to any node which could reference metrics using the `metric()` function -For users of [state-based selection](understanding-state): This release also includes new logic declaring forwards compatibility for older manifest versions. While running dbt Core v1.2, it should be possible to use `state:modified --state ...` selection against a manifest produced by dbt Core v1.0 or v1.1. +For users of [state-based selection](/reference/node-selection/syntax#about-node-selection): This release also includes new logic declaring forwards compatibility for older manifest versions. While running dbt Core v1.2, it should be possible to use `state:modified --state ...` selection against a manifest produced by dbt Core v1.0 or v1.1. ## For maintainers of adapter plugins @@ -25,12 +26,12 @@ See GitHub discussion [dbt-labs/dbt-core#5468](https://github.com/dbt-labs/dbt-c ## New and changed functionality -- **[Grants](resource-configs/grants)** are natively supported in `dbt-core` for the first time. That support extends to all standard materializations, and the most popular adapters. If you already use hooks to apply simple grants, we encourage you to use built-in `grants` to configure your models, seeds, and snapshots instead. This will enable you to [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) up your duplicated or boilerplate code. +- **[Grants](/reference/resource-configs/grants)** are natively supported in `dbt-core` for the first time. That support extends to all standard materializations, and the most popular adapters. If you already use hooks to apply simple grants, we encourage you to use built-in `grants` to configure your models, seeds, and snapshots instead. This will enable you to [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) up your duplicated or boilerplate code. 
- **[Metrics](/docs/build/metrics)** now support an `expression` type (metrics-on-metrics), as well as a `metric()` function to use when referencing metrics from within models, macros, or `expression`-type metrics. For more information on how to use expression metrics, check out the [**`dbt_metrics` package**](https://github.com/dbt-labs/dbt_metrics) -- **[dbt-Jinja functions](/reference/dbt-jinja-functions)** now include the [`itertools` Python module](dbt-jinja-functions/modules#itertools), as well as the [set](dbt-jinja-functions/set) and [zip](dbt-jinja-functions/zip) functions. -- **[Node selection](node-selection/syntax)** includes a [file selection method](node-selection/methods#the-file-method) (`-s model.sql`), and [yaml selector](node-selection/yaml-selectors) inheritance. -- **[Global configs](global-configs)** now include CLI flag and environment variable settings for [`target-path`](target-path) and [`log-path`](log-path), which can be used to override the values set in `dbt_project.yml` +- **[dbt-Jinja functions](/reference/dbt-jinja-functions)** now include the [`itertools` Python module](/reference/dbt-jinja-functions/modules#itertools), as well as the [set](/reference/dbt-jinja-functions/set) and [zip](/reference/dbt-jinja-functions/zip) functions. +- **[Node selection](/reference/node-selection/syntax)** includes a [file selection method](/reference/node-selection/methods#the-file-method) (`-s model.sql`), and [yaml selector](/reference/node-selection/yaml-selectors) inheritance. 
+- **[Global configs](/reference/global-configs/about-global-configs)** now include CLI flag and environment variable settings for [`target-path`](/reference/project-configs/target-path) and [`log-path`](/reference/project-configs/log-path), which can be used to override the values set in `dbt_project.yml` ### Specific adapters -- [Postgres](/reference/warehouse-setups/postgres-setup) and [Redshift](/reference/warehouse-setups/redshift-setup) profiles support a `retries` config, if dbt encounters an operational error or timeout when opening a connection. The default is 1 retry. +- [Postgres](/docs/core/connect-data-platform/postgres-setup) and [Redshift](/docs/core/connect-data-platform/redshift-setup) profiles support a `retries` config, if dbt encounters an operational error or timeout when opening a connection. The default is 1 retry. diff --git a/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md b/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md index 382fb2e5d7a..131ecc97657 100644 --- a/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md +++ b/website/docs/guides/migration/versions/07-upgrading-to-v1.1.md @@ -1,19 +1,20 @@ --- title: "Upgrading to v1.1" +description: New features and changes in dbt Core v1.1 --- ### Resources - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.1.latest/CHANGELOG.md) -- [CLI Installation guide](/docs/get-started/installation) +- [CLI Installation guide](/docs/core/installation) - [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) -## Breaking changes +## What to know before upgrading There are no breaking changes for code in dbt projects and packages. We are committed to providing backwards compatibility for all versions 1.x. If you encounter an error upon upgrading, please let us know by [opening an issue](https://github.com/dbt-labs/dbt-core/issues/new). ### For maintainers of adapter plugins -We have reworked the testing suite for adapter plugin functionality. 
For details on the new testing suite, see: [Testing a new adapter](/guides/advanced/adapter-development/4-testing-a-new-adapter). +We have reworked the testing suite for adapter plugin functionality. For details on the new testing suite, see: [Testing a new adapter](/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter). The abstract methods `get_response` and `execute` now only return `connection.AdapterReponse` in type hints. Previously, they could return a string. We encourage you to update your methods to return an object of class `AdapterResponse`, or implement a subclass specific to your adapter. This also gives you the opportunity to add fields specific to your adapter's query execution, such as `rows_affected` or `bytes_processed`. @@ -21,7 +22,7 @@ The abstract methods `get_response` and `execute` now only return `connection.Ad The manifest schema version will be updated to v5. The only change is to the default value of `config` for parsed nodes. -For users of [state-based functionality](understanding-state), such as the `state:modified` selector, recall that: +For users of [state-based functionality](/reference/node-selection/syntax#about-node-selection), such as the `state:modified` selector, recall that: > The `--state` artifacts must be of schema versions that are compatible with the currently running dbt version. @@ -35,30 +36,30 @@ Expected a schema version of "https://schemas.getdbt.com/dbt/manifest/v5.json" i [**Incremental models**](/docs/build/incremental-models) can now accept a list of multiple columns as their `unique_key`, for models that need a combination of columns to uniquely identify each row. This is supported by the most common data warehouses, for incremental strategies that make use of the `unique_key` config (`merge` and `delete+insert`). -[**Generic tests**](resource-properties/tests) can define custom names. This is useful to "prettify" the synthetic name that dbt applies automatically. 
It's needed to disambiguate the case when the same generic test is defined multiple times with different configurations. +[**Generic tests**](/reference/resource-properties/tests) can define custom names. This is useful to "prettify" the synthetic name that dbt applies automatically. It's needed to disambiguate the case when the same generic test is defined multiple times with different configurations. -[**Sources**](source-properties) can define configuration inline with other `.yml` properties, just like other resource types. The only supported config is `enabled`; you can use this to dynamically enable/disable sources based on environment or package variables. +[**Sources**](/reference/source-properties) can define configuration inline with other `.yml` properties, just like other resource types. The only supported config is `enabled`; you can use this to dynamically enable/disable sources based on environment or package variables. ### Advanced and experimental functionality -**Fresh Rebuilds.** There's a new _experimental_ selection method in town: [`source_status:fresher`](node-selection/methods#the-source_status-method). Much like the `state:` and `result` methods, the goal is to use dbt metadata to run your DAG more efficiently. If dbt has access to previous and current results of `dbt source freshness` (the `sources.json` artifact), dbt can compare them to determine which sources have loaded new data, and select only resources downstream of "fresher" sources. Read more in [Understanding State](understanding-state) and [CI/CD in dbt Cloud](/docs/deploy/cloud-ci-job). +**Fresh Rebuilds.** There's a new _experimental_ selection method in town: [`source_status:fresher`](/reference/node-selection/methods#the-source_status-method). Much like the `state:` and `result` methods, the goal is to use dbt metadata to run your DAG more efficiently. 
If dbt has access to previous and current results of `dbt source freshness` (the `sources.json` artifact), dbt can compare them to determine which sources have loaded new data, and select only resources downstream of "fresher" sources. Read more in [Understanding State](/reference/node-selection/syntax#about-node-selection) and [CI/CD in dbt Cloud](/docs/deploy/continuous-integration). [**dbt-Jinja functions**](/reference/dbt-jinja-functions) have a new landing page, and two new members: - [`print`](/reference/dbt-jinja-functions/print) exposes the Python `print()` function. It can be used as an alternative to `log()`, and together with the `QUIET` config, for advanced macro-driven workflows. - [`selected_resources`](/reference/dbt-jinja-functions/selected_resources) exposes, at runtime, the list of DAG nodes selected by the current task. -[**Global configs**](global-configs) include some new additions: +[**Global configs**](/reference/global-configs/about-global-configs) include some new additions: - `QUIET` and `NO_PRINT`, to control which log messages dbt prints to terminal output. For use in advanced macro-driven workflows, such as [codegen](https://hub.getdbt.com/dbt-labs/codegen/latest/). - `CACHE_SELECTED_ONLY` is an _experimental_ config that can significantly speed up dbt's start-of-run preparations, in cases where you're running only a few models from a large project that manages many schemas. ### For users of specific adapters -**dbt-bigquery** added Support for finer-grained configuration of query timeout and retry when defining your [connection profile](/reference/warehouse-setups/bigquery-setup). +**dbt-bigquery** added Support for finer-grained configuration of query timeout and retry when defining your [connection profile](/docs/core/connect-data-platform/bigquery-setup). 
-**dbt-spark** added support for a [`session` connection method](/reference/warehouse-setups/spark-setup#session), for use with a pySpark session, to support rapid iteration when developing advanced or experimental functionality. This connection method is not recommended for new users, and it is not supported in dbt Cloud. +**dbt-spark** added support for a [`session` connection method](/docs/core/connect-data-platform/spark-setup#session), for use with a pySpark session, to support rapid iteration when developing advanced or experimental functionality. This connection method is not recommended for new users, and it is not supported in dbt Cloud. ### Dependencies -[Python compatibility](install-python-compatibility): dbt Core officially supports Python 3.10 +[Python compatibility](/faqs/Core/install-python-compatibility): dbt Core officially supports Python 3.10 diff --git a/website/docs/guides/migration/versions/08-upgrading-to-v1.0.md b/website/docs/guides/migration/versions/08-upgrading-to-v1.0.md index de8a5690521..9fc7991c087 100644 --- a/website/docs/guides/migration/versions/08-upgrading-to-v1.0.md +++ b/website/docs/guides/migration/versions/08-upgrading-to-v1.0.md @@ -1,34 +1,35 @@ --- title: "Upgrading to v1.0" +description: New features and changes in dbt Core v1.0 --- ### Resources - [Discourse](https://discourse.getdbt.com/t/3180) - [Changelog](https://github.com/dbt-labs/dbt-core/blob/1.0.latest/CHANGELOG.md) -- [CLI Installation guide](/docs/get-started/installation) +- [CLI Installation guide](/docs/core/installation) - [Cloud upgrade guide](/docs/dbt-versions/upgrade-core-in-cloud) -## Breaking changes +## What to know before upgrading dbt Core major version 1.0 includes a number of breaking changes! Wherever possible, we have offered backwards compatibility for old behavior, and (where necessary) made migration simple. 
### Renamed fields in `dbt_project.yml` **These affect everyone:** -- [model-paths](model-paths) have replaced `source-paths` in `dbt-project.yml. -- [seed-paths](seed-paths) have replaced `data-paths` in `dbt-project.yml with a default value of `seeds`. -- The [packages-install-path](packages-install-path) was updated from `modules-path`. Additionally the default value is now `dbt_packages` instead of `dbt_modules`. You may need to update this value in [`clean-targets`](clean-targets). -- Default for `quote-columns` is now `True` for all adapters other than Snowflake. +- [model-paths](/reference/project-configs/model-paths) have replaced `source-paths` in `dbt-project.yml`. +- [seed-paths](/reference/project-configs/seed-paths) have replaced `data-paths` in `dbt-project.yml` with a default value of `seeds`. +- The [packages-install-path](/reference/project-configs/packages-install-path) was updated from `modules-path`. Additionally the default value is now `dbt_packages` instead of `dbt_modules`. You may need to update this value in [`clean-targets`](/reference/project-configs/clean-targets). +- Default for `quote_columns` is now `True` for all adapters other than Snowflake. **These probably don't:** -- The default value of [test-paths](test-paths) has been updated to be the plural `tests`. -- The default value of [analysis-paths](analysis-paths) has been updated to be the plural `analyses`. +- The default value of [test-paths](/reference/project-configs/test-paths) has been updated to be the plural `tests`. +- The default value of [analysis-paths](/reference/project-configs/analysis-paths) has been updated to be the plural `analyses`. ### Tests -The two **test types** are now "singular" and "generic" (instead of "data" and "schema", respectively). The `test_type:` selection method accepts `test_type:singular` and `test_type:generic`. (It will also accept `test_type:schema` and `test_type:data` for backwards compatibility.) 
**Not backwards compatible:** The `--data` and `--schema` flags to dbt test are no longer supported, and tests no longer have the tags `'data'` and `'schema'` automatically applied. Updated docs: [tests](/docs/build/tests), [test selection](test-selection-examples), [selection methods](node-selection/methods). +The two **test types** are now "singular" and "generic" (instead of "data" and "schema", respectively). The `test_type:` selection method accepts `test_type:singular` and `test_type:generic`. (It will also accept `test_type:schema` and `test_type:data` for backwards compatibility.) **Not backwards compatible:** The `--data` and `--schema` flags to dbt test are no longer supported, and tests no longer have the tags `'data'` and `'schema'` automatically applied. Updated docs: [tests](/docs/build/tests), [test selection](/reference/node-selection/test-selection-examples), [selection methods](/reference/node-selection/methods). -The `greedy` flag/property has been renamed to **`indirect_selection`**, which is now eager by default. **Note:** This reverts test selection to its pre-v0.20 behavior by default. `dbt test -s my_model` _will_ select multi-parent tests, such as `relationships`, that depend on unselected resources. To achieve the behavior change in v0.20 + v0.21, set `--indirect-selection=cautious` on the CLI or `indirect_selection: cautious` in yaml selectors. Updated docs: [test selection examples](test-selection-examples), [yaml selectors](yaml-selectors) +The `greedy` flag/property has been renamed to **`indirect_selection`**, which is now eager by default. **Note:** This reverts test selection to its pre-v0.20 behavior by default. `dbt test -s my_model` _will_ select multi-parent tests, such as `relationships`, that depend on unselected resources. To achieve the behavior change in v0.20 + v0.21, set `--indirect-selection=cautious` on the CLI or `indirect_selection: cautious` in YAML selectors. 
Updated docs: [test selection examples](/reference/node-selection/test-selection-examples), [yaml selectors](/reference/node-selection/yaml-selectors). ### Global macros @@ -43,29 +44,29 @@ Global project macros have been reorganized, and some old unused macros have bee ### For users of adapter plugins -- **BigQuery:** Support for [ingestion-time-partitioned tables](creating-date-partitioned-tables) has been officially deprecated in favor of modern approaches. Use `partition_by` and incremental modeling strategies instead. +- **BigQuery:** Support for [ingestion-time-partitioned tables](/guides/legacy/creating-date-partitioned-tables) has been officially deprecated in favor of modern approaches. Use `partition_by` and incremental modeling strategies instead. ### For maintainers of plugins + other integrations -We've introduced a new [**structured event interface**](events-logging), and we've transitioned all dbt logging to use this new system. **This includes a breaking change for adapter plugins**, requiring a very simple migration. For more details, see the [`events` module README](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/events/README.md#adapter-maintainers). If you maintain a different kind of plugin that _needs_ legacy logging, for the time being, you can re-enable it with an env var (`DBT_ENABLE_LEGACY_LOGGER=True`); be advised that we will remove this capability in a future version of dbt Core. +We've introduced a new [**structured event interface**](/reference/events-logging), and we've transitioned all dbt logging to use this new system. **This includes a breaking change for adapter plugins**, requiring a very simple migration. For more details, see the [`events` module README](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/events/README.md#adapter-maintainers). 
If you maintain a different kind of plugin that _needs_ legacy logging, for the time being, you can re-enable it with an env var (`DBT_ENABLE_LEGACY_LOGGER=True`); be advised that we will remove this capability in a future version of dbt Core. -The [**dbt RPC Server**](rpc) has been split out from `dbt-core` and is now packaged separately. Its functionality will be fully deprecated by the end of 2022, in favor of a new dbt Server. Instead of `dbt rpc`, use `dbt-rpc serve`. +The [**dbt RPC Server**](/reference/commands/rpc) has been split out from `dbt-core` and is now packaged separately. Its functionality will be fully deprecated by the end of 2022, in favor of a new dbt Server. Instead of `dbt rpc`, use `dbt-rpc serve`. **Artifacts:** New schemas (manifest v4, run results v4, sources v3). Notable changes: add `metrics` nodes; schema test + data test nodes are renamed to generic test + singular test nodes; freshness threshold default values look slightly different. ### Deprecations from long ago Several under-the-hood changes from past minor versions, tagged with deprecation warnings, have now been fully deprecated. -- The `packages` argument of [dispatch](dispatch) has been deprecated and will raise an exception when used. -- The "adapter_macro" macro has been deprecated. Instead, use the [dispatch](dispatch) method to find a macro and call the result. +- The `packages` argument of [dispatch](/reference/dbt-jinja-functions/dispatch) has been deprecated and will raise an exception when used. +- The "adapter_macro" macro has been deprecated. Instead, use the [dispatch](/reference/dbt-jinja-functions/dispatch) method to find a macro and call the result. - The `release` arg has been removed from the `execute_macro` method. 
## New features and changed documentation -- Add [metrics](metrics), a new node type -- [Generic tests](custom-generic-tests) can be defined in `tests/generic` (new), in addition to `macros/` (as before) -- [Parsing](parsing): partial parsing and static parsing have been turned on by default. -- [Global configs](global-configs) have been standardized. Related updates to [global CLI flags](global-cli-flags) and [`profiles.yml`](profiles.yml). -- [The `init` command](init) has a whole new look and feel. It's no longer just for first-time users. +- Add [metrics](/docs/build/metrics), a new node type +- [Generic tests](/guides/best-practices/writing-custom-generic-tests) can be defined in `tests/generic` (new), in addition to `macros/` (as before) +- [Parsing](/reference/parsing): partial parsing and static parsing have been turned on by default. +- [Global configs](/reference/global-configs/about-global-configs) have been standardized. Related updates to [global CLI flags](/reference/global-cli-flags) and [`profiles.yml`](/docs/core/connect-data-platform/profiles.yml). +- [The `init` command](/reference/commands/init) has a whole new look and feel. It's no longer just for first-time users. - Add `result:` subselectors for smarter reruns when dbt models have errors and tests fail. 
See examples: [Pro-tips for Workflows](/guides/legacy/best-practices#pro-tips-for-workflows) -- Secret-prefixed [env vars](env_var) are now allowed only in `profiles.yml` + `packages.yml` +- Secret-prefixed [env vars](/reference/dbt-jinja-functions/env_var) are now allowed only in `profiles.yml` + `packages.yml` diff --git a/website/docs/guides/migration/versions/09-upgrading-to-v0.21.md b/website/docs/guides/migration/versions/09-upgrading-to-v0.21.md index b25abad32f2..e5fbdf3fc7c 100644 --- a/website/docs/guides/migration/versions/09-upgrading-to-v0.21.md +++ b/website/docs/guides/migration/versions/09-upgrading-to-v0.21.md @@ -4,7 +4,7 @@ title: "Upgrading to v0.21" --- :::caution Unsupported version -dbt Core v0.21 has reached the end of critical support. No new patch versions will be released, and it will stop running in dbt Cloud on June 30, 2022. Read ["About dbt Core versions"](core-versions) for more details. +dbt Core v0.21 has reached the end of critical support. No new patch versions will be released, and it will stop running in dbt Cloud on June 30, 2022. Read ["About dbt Core versions"](/docs/dbt-versions/core) for more details. ::: ### Resources @@ -18,33 +18,33 @@ dbt Core v0.21 has reached the end of critical support. No new patch versions wi - `dbt source snapshot-freshness` has been renamed to `dbt source freshness`. Its node selection logic is now consistent with other tasks. In order to check freshness for a specific source, you must prefix it with `source:`. - **Snowflake:** Turn off transactions and turn on `autocommit` by default. Within dbt materializations, wrap [DML statements](https://stackoverflow.com/a/44796508) in explicit `begin` and `commit`. Note that it is not recommended to run statements outside of dbt logic. If you do this, despite our recommendation, you will need to wrap those statements in explicit `begin` and `commit`. 
Note also that this may affect user-space code that depends on transactions, such as pre-hooks and post-hooks that specify `transaction: true` or `transaction: false`. We recommend removing those references to transactions. - **Artifacts:** - - [`manifest.json`](manifest-json) uses a `v3` schema that includes additional node properties (no changes to existing properties) - - [`run_results.json`](run-results-json) uses a `v3` schema that has added `skipped` as a potential `TestResult` - - [`sources.json`](sources-json) has a new `v2` schema that has added timing and thread details + - [`manifest.json`](/reference/artifacts/manifest-json) uses a `v3` schema that includes additional node properties (no changes to existing properties) + - [`run_results.json`](/reference/artifacts/run-results-json) uses a `v3` schema that has added `skipped` as a potential `TestResult` + - [`sources.json`](/reference/artifacts/sources-json) has a new `v2` schema that has added timing and thread details ## New and changed documentation ### Tasks -- [Commands](dbt-commands), [`build`](commands/build), [rpc](rpc): Add `dbt build` -- [Commands: `source`](commands/source): Renamed to `dbt source freshness`. -- [`deps`](commands/deps): Add `dbt deps` logging for outdated packages -- [`list`](commands/list): Add `--output-keys` flag and RPC parameter +- [Commands](/reference/dbt-commands), [`build`](/reference/commands/build), [rpc](/reference/commands/rpc): Add `dbt build` +- [Commands: `source`](/reference/commands/source): Renamed to `dbt source freshness`. +- [`deps`](/reference/commands/deps): Add `dbt deps` logging for outdated packages +- [`list`](/reference/commands/list): Add `--output-keys` flag and RPC parameter ## Selection -- [Commands: `source`](commands/source): Updated selection logic to match other tasks. When selecting a specific source to check freshness, you must prefix it with `source:`. 
-- [Node selection syntax](node-selection/syntax), [commands](dbt-commands): Switch `--models` for `--select` across the board. (Commands which previously used the `--models` flag still support it for backwards compatibility.) -- [YAML selectors](yaml-selectors#default) now support an optional `default` property. If set, dbt will use custom selection criteria for commands that do not specify their own selection/exclusion flags. -- [Selection methods](node-selection/methods) and [state comparison caveats](state-comparison-caveats): Add `state:modified` subselectors, and reflect that it now includes changes to upstream macros. -- [Test selection examples](test-selection-examples) includes more discussion of indirect selection (a change in v0.20), and the optional "greedy" flag/property (new in v0.21), which you can optionally set to include tests that have a mix of selected + unselected parents +- [Commands: `source`](/reference/commands/source): Updated selection logic to match other tasks. When selecting a specific source to check freshness, you must prefix it with `source:`. +- [Node selection syntax](/reference/node-selection/syntax), [commands](/reference/dbt-commands): Switch `--models` for `--select` across the board. (Commands which previously used the `--models` flag still support it for backwards compatibility.) +- [YAML selectors](/reference/node-selection/yaml-selectors#default) now support an optional `default` property. If set, dbt will use custom selection criteria for commands that do not specify their own selection/exclusion flags. +- [Selection methods](/reference/node-selection/methods) and [state comparison caveats](/reference/node-selection/state-comparison-caveats): Add `state:modified` subselectors, and reflect that it now includes changes to upstream macros. 
+- [Test selection examples](/reference/node-selection/test-selection-examples) includes more discussion of indirect selection (a change in v0.20), and the optional "greedy" flag/property (new in v0.21), which you can optionally set to include tests that have a mix of selected + unselected parents ### Elsewhere in Core -- [Resource configs and properties](configs-and-properties) docs have been consolidated and reconciled. New `config` property that makes it possible to configure models, seeds, snapshots, and tests in all yaml files. +- [Resource configs and properties](/reference/configs-and-properties) docs have been consolidated and reconciled. New `config` property that makes it possible to configure models, seeds, snapshots, and tests in all YAML files. - [Configuring incremental models](/docs/build/incremental-models): New optional configuration for incremental models, `on_schema_change`. -- [Environment variables](env_var): Add a log-scrubbing prefix, `DBT_ENV_SECRET_` -- [Test `where` config](where) has been reimplemented as a macro (`get_where_subquery`) that you can optionally reimplement, too -- [`dispatch`](dispatch) now supports reimplementing global macros residing in the `dbt` macro namespace with versions from installed packages, by leveraging `search_order` in the [`dispatch` project config](project-configs/dispatch-config) +- [Environment variables](/reference/dbt-jinja-functions/env_var): Add a log-scrubbing prefix, `DBT_ENV_SECRET_` +- [Test `where` config](/reference/resource-configs/where) has been reimplemented as a macro (`get_where_subquery`) that you can optionally reimplement, too +- [`dispatch`](/reference/dbt-jinja-functions/dispatch) now supports reimplementing global macros residing in the `dbt` macro namespace with versions from installed packages, by leveraging `search_order` in the [`dispatch` project config](/reference/project-configs/dispatch-config) ### Plugins -- **Postgres** [profile](/reference/warehouse-setups/postgres-setup) 
property `connect_timeout` now configurable. Also applicable to child plugins (e.g. `dbt-redshift`) -- **Redshift**: [profile](/reference/warehouse-setups/redshift-setup) property `ra3_node: true` to support cross-database source definitions and read-only querying -- **BigQuery**: [profile](/reference/warehouse-setups/bigquery-setup) property `execution_project` now configurable. [Snapshots](snapshots) support `target_project` and `target_dataset` config aliases. +- **Postgres** [profile](/docs/core/connect-data-platform/postgres-setup) property `connect_timeout` now configurable. Also applicable to child plugins (e.g. `dbt-redshift`) +- **Redshift**: [profile](/docs/core/connect-data-platform/redshift-setup) property `ra3_node: true` to support cross-database source definitions and read-only querying +- **BigQuery**: [profile](/docs/core/connect-data-platform/bigquery-setup) property `execution_project` now configurable. [Snapshots](/docs/build/snapshots) support `target_project` and `target_dataset` config aliases. diff --git a/website/docs/guides/migration/versions/10-upgrading-to-v0.20.md b/website/docs/guides/migration/versions/10-upgrading-to-v0.20.md index 7065b567bd8..8b33bfa3879 100644 --- a/website/docs/guides/migration/versions/10-upgrading-to-v0.20.md +++ b/website/docs/guides/migration/versions/10-upgrading-to-v0.20.md @@ -4,7 +4,7 @@ title: "Upgrading to v0.20" --- :::caution Unsupported version -dbt Core v0.20 has reached the end of critical support. No new patch versions will be released, and it will stop running in dbt Cloud on June 30, 2022. Read ["About dbt Core versions"](core-versions) for more details. +dbt Core v0.20 has reached the end of critical support. No new patch versions will be released, and it will stop running in dbt Cloud on June 30, 2022. Read ["About dbt Core versions"](/docs/dbt-versions/core) for more details. ::: ### Resources @@ -18,27 +18,27 @@ dbt Core v0.20 has reached the end of critical support. 
No new patch versions wi - Schema test macros are now `test` blocks, which we're going to start calling "generic tests." There is backwards compatibility for schema test macros prefixed `test_`, and you can still use them without switching to test blocks (though we hope you will soon!). The biggest breaking change is that _all_ tests now return a set of failing rows, rather than a single numeric value. This resolved a longstanding inconsistency between schema tests and data tests. - **For package maintainers (and some users):** The syntax for `adapter.dispatch()` has changed; see linked documentation below. - **For adapter plugin maintainers:** Macro dispatch now includes "parent" adapter implementations before using the default implementation. If you maintain an adapter plugin that inherits from another adapter (e.g. `dbt-redshift` inherits from `dbt-postgres`), `adapter.dispatch()` will now look for prefixed macros in the following order: `redshift__`, `postgres__`, `default__`. -- **For artifact users:** The [manifest](manifest-json) and [run_results](run-results-json) now use a v2 schema. What changed: there are a handful of new properties in the manifest; the number of failures for a test has been moved to a new property `failures`, so that `message` can be the human-readable failure message. +- **For artifact users:** The [manifest](/reference/artifacts/manifest-json) and [run_results](/reference/artifacts/run-results-json) now use a v2 schema. What changed: there are a handful of new properties in the manifest; the number of failures for a test has been moved to a new property `failures`, so that `message` can be the human-readable failure message. 
## New and changed documentation ### Tests - [Building a dbt Project: tests](/docs/build/tests) -- [Test Configs](test-configs) -- [Test properties](resource-properties/tests) -- [Node Selection](node-selection/syntax) (with updated [test selection examples](test-selection-examples)) -- [Writing custom generic tests](custom-generic-tests) +- [Test Configs](/reference/test-configs) +- [Test properties](/reference/resource-properties/tests) +- [Node Selection](/reference/node-selection/syntax) (with updated [test selection examples](/reference/node-selection/test-selection-examples)) +- [Writing custom generic tests](/guides/best-practices/writing-custom-generic-tests) ### Elsewhere in Core -- [Parsing](parsing): rework of partial parsing, introduction of experimental parser -- The [graph](graph) Jinja context variable includes `exposures` +- [Parsing](/reference/parsing): rework of partial parsing, introduction of experimental parser +- The [graph](/reference/dbt-jinja-functions/graph) Jinja context variable includes `exposures` - [Packages](/docs/build/packages) can now be installed from git with a specific commit hash as the revision, or via sparse checkout if the dbt project is located in a `subdirectory`. -- [adapter.dispatch](dispatch) supports new arguments, a new [project-level config](project-configs/dispatch-config), and includes parent adapters when searching for macro implementations. -- [Exposures](exposure-properties) support `tags` and `meta` properties +- [adapter.dispatch](/reference/dbt-jinja-functions/dispatch) supports new arguments, a new [project-level config](/reference/project-configs/dispatch-config), and includes parent adapters when searching for macro implementations. 
+- [Exposures](/reference/exposure-properties) support `tags` and `meta` properties ### Plugins -- New partition-related [BigQuery configs](bigquery-configs#additional-partition-configs): `require_partition_filter` and `partition_expiration_days` -- On BigQuery, dbt can now add [query comment](query-comment) items as job labels +- New partition-related [BigQuery configs](/reference/resource-configs/bigquery-configs#additional-partition-configs): `require_partition_filter` and `partition_expiration_days` +- On BigQuery, dbt can now add [query comment](/reference/project-configs/query-comment) items as job labels - Snowflake and BigQuery [incremental models](/docs/build/incremental-models#strategy-specific-configs) using the `merge` strategy accept a new optional config, `merge_update_columns`. -- [Postgres configs](postgres-configs) now include first-class support for `indexes` +- [Postgres configs](/reference/resource-configs/postgres-configs) now include first-class support for `indexes` diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-11-0.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-11-0.md index a0e24f2d456..e307c46fdf9 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-11-0.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-11-0.md @@ -4,7 +4,7 @@ id: "upgrading-to-0-11-0" --- ## Schema.yml v2 syntax -dbt v0.11.0 adds an auto-generated docs site to your dbt project. To make effective use of the documentation site, you'll need to use the new "version 2" schema.yml syntax. For a full explanation of the version 2 syntax, check out the [schema.yml Files](configs-and-properties) section of the documentation. +dbt v0.11.0 adds an auto-generated docs site to your dbt project. To make effective use of the documentation site, you'll need to use the new "version 2" schema.yml syntax. 
For a full explanation of the version 2 syntax, check out the [schema.yml Files](/reference/configs-and-properties) section of the documentation. ### Translating schema.yml files to the v2 syntax diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-13-0.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-13-0.md index 7834c330b5c..14a70e177e8 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-13-0.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-13-0.md @@ -9,17 +9,17 @@ id: "upgrading-to-0-13-0" The special Jinja variable `{{this}}` is no longer implemented for `on-run-start` and `on-run-end` hooks. -Use a variable from the [`{{ target }}` context](target) or [`on-run-end` context](on-run-end-context) instead. +Use a variable from the [`{{ target }}` context](/reference/dbt-jinja-functions/target) or [`on-run-end` context](/reference/dbt-jinja-functions/on-run-end-context) instead. ### Adapter methods A number of materialization-specific adapter methods have changed in breaking ways. If you use these adapter methods in your macros or materializations, you may need to update your code accordingly. - - query_for_existing - **removed**, use [get_relation](adapter#get_relation) instead. - - [get_missing_columns](adapter#get_missing_columns) - changed to take `Relation`s instead of schemas and identifiers - - [expand_target_column_types](adapter#expand_target_column_types) - changed to take a `Relation` instead of schema, identifier - - [get_relation](adapter#get_relation) - added a `database` argument - - [create_schema](adapter#create_schema) - added a `database` argument - - [drop_schema](adapter#drop_schema) - added a `database` argument + - query_for_existing - **removed**, use [get_relation](/reference/dbt-jinja-functions/adapter#get_relation) instead. 
+ - [get_missing_columns](/reference/dbt-jinja-functions/adapter#get_missing_columns) - changed to take `Relation`s instead of schemas and identifiers + - [expand_target_column_types](/reference/dbt-jinja-functions/adapter#expand_target_column_types) - changed to take a `Relation` instead of schema, identifier + - [get_relation](/reference/dbt-jinja-functions/adapter#get_relation) - added a `database` argument + - [create_schema](/reference/dbt-jinja-functions/adapter#create_schema) - added a `database` argument + - [drop_schema](/reference/dbt-jinja-functions/adapter#drop_schema) - added a `database` argument ## End of support diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-0.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-0.md index c22bd1490bb..3b9c8560230 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-0.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-0.md @@ -12,7 +12,7 @@ This guide outlines migration instructions for: ## Upgrading to Snapshot Blocks -In dbt v0.14.0, `archives` have been replaced by `snapshots`. Snapshots accomplish the same goal as archives, but are more powerful and more flexible. For the complete guide on using snapshots, consult the [snapshot documentation](snapshots). +In dbt v0.14.0, `archives` have been replaced by `snapshots`. Snapshots accomplish the same goal as archives, but are more powerful and more flexible. For the complete guide on using snapshots, consult the [snapshot documentation](/docs/build/snapshots). There are a handful of changes to be aware of as you migrate from archives to snapshots: - meta column names are now prefixed with `dbt_` @@ -105,7 +105,7 @@ Next, inspect the new snapshots in your `snapshots/` directory. There should be When you are confident that the migration has completed successfully, you can manually delete the backup tables in your archived schema(s). 
These backup tables will be suffixed with `_dbt_archive_migration_backup`. -Snapshots participate in the dbt graph, so feel free to replace any `schema.table` references in your model code with `{{ ref('archive_name') }}`. You may also need to make changes to downstream models or reports to account for the changes to your snapshot meta-column names. Consult the [snapshot docs](snapshots) for full usage instructions. +Snapshots participate in the dbt graph, so feel free to replace any `schema.table` references in your model code with `{{ ref('archive_name') }}`. You may also need to make changes to downstream models or reports to account for the changes to your snapshot meta-column names. Consult the [snapshot docs](/docs/build/snapshots) for full usage instructions. ### Migrating archives manually (not recommended) @@ -156,7 +156,7 @@ The `--non-destructive` flag was problematic for a few reasons: Snowflake, BigQuery, SparkSQL, and Presto users should be unaffected by this change as there is limited merit to using the `--non-destructive` flag on these databases. -Redshift users should consider using the [bind: false](redshift-configs#late-binding-views) config to instruct dbt to create unbound views. +Redshift users should consider using the [bind: false](/reference/resource-configs/redshift-configs#late-binding-views) config to instruct dbt to create unbound views. Postgres users should ensure that they use table or incremental models for relations which are queried by end-users. 
diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-1.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-1.md index a411b079292..a81740d5a68 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-1.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-14-1.md @@ -13,7 +13,7 @@ The following section only applies to Snapshots running against Snowflake or Big ::: -When a [Snapshot](snapshots) is configured to use the `check` strategy, dbt will compare the specified `check_cols` between the source dataset and the snapshotted dataset to determine if a row in the Snapshot has changed. A logic error in the v0.14.0 release of dbt caused this strategy to fail if the values of the specified `check_cols` for a given row cycled back into a previously known state. Importantly, this issue only affects Snowflake and BigQuery due to their respective uses of the `merge` statement in Snapshots. +When a [Snapshot](/docs/build/snapshots) is configured to use the `check` strategy, dbt will compare the specified `check_cols` between the source dataset and the snapshotted dataset to determine if a row in the Snapshot has changed. A logic error in the v0.14.0 release of dbt caused this strategy to fail if the values of the specified `check_cols` for a given row cycled back into a previously known state. Importantly, this issue only affects Snowflake and BigQuery due to their respective uses of the `merge` statement in Snapshots. In this failure mode, dbt would "finalize" existing records by setting a `dbt_valid_to` date for a changed record without correspondingly inserting a new record for the change. **In this state, the finalized records will no longer be tracked in the Snapshot **. 
diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-15-0.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-15-0.md index ca54e6c8abb..02ab297c07a 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-15-0.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-15-0.md @@ -16,16 +16,16 @@ Compilation errors in .yml files are now treated as errors instead of warnings. The `table_name` field has been removed from Relations. Macros that expect this field will now return errors. See the latest -[class reference](dbt-classes#relation) for details. +[class reference](/reference/dbt-classes#relation) for details. ### Custom materializations -All materializations must now [manage dbt's Relation cache](creating-new-materializations#update-the-relation-cache). +All materializations must now [manage dbt's Relation cache](/guides/advanced/creating-new-materializations#update-the-relation-cache). ### dbt Server The existing `compile` and `execute` rpc tasks have been renamed to `compile_sql` and `execute_sql`. -For more details, see the latest [rpc docs](rpc). +For more details, see the latest [rpc docs](/reference/commands/rpc). ## Python requirements diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-16-0.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-16-0.md index 8936f0ff3fc..a34f23c4c89 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-16-0.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-16-0.md @@ -18,7 +18,7 @@ after upgrading to v0.16.0. ### Seed type inference A number of improvements have been made to the type-inference logic in dbt. 
dbt -previously errantly converted string values in [seed CSV files](seeds) +previously errantly converted string values in [seed CSV files](/docs/build/seeds) like `sunday` or `March` into date timestamps in the year `0001`. This was obviously incorrect and has now been remedied, but if you _relied_ on this functionality, then this represents a breaking change. See @@ -44,7 +44,7 @@ will attempt to parse that string out to a field and data_type representation. A release of dbt will remove the ability for `partition_by` configs to be configured using a string. -See the docs on [BigQuery partitioning](bigquery-configs#partition-clause) for +See the docs on [BigQuery partitioning](/reference/resource-configs/bigquery-configs#partition-clause) for more information on the updated `partition_by` syntax for BigQuery models. See also [this guide](https://discourse.getdbt.com/t/bigquery-dbt-incremental-changes/982) for more information on how dbt leverages this new syntax to make incremental models build @@ -102,17 +102,17 @@ please be mindful of the following changes to dbt's Python dependencies: - Changed upper bound on `Jinja2 < 3` ## New and changed documentation -- [BigQuery partitioning configs](bigquery-configs) -- [Select specific seeds to run with `--select`](seed) +- [BigQuery partitioning configs](/reference/resource-configs/bigquery-configs) +- [Select specific seeds to run with `--select`](/reference/commands/seed) - [New `generate_database_name` macro](/docs/build/custom-databases#generate_database_name) -- [New `dbt_project.yml context`](dbt-project-yml-context) -- [New configurations for schema.yml files](configs-and-properties) +- [New `dbt_project.yml context`](/reference/dbt-jinja-functions/dbt-project-yml-context) +- [New configurations for schema.yml files](/reference/configs-and-properties) - [New configurations for Source declarations](/docs/build/sources) -- [New Postgres connection configs](/reference/warehouse-setups/postgres-setup) -- [New 
Snowflake KeyPair auth configs](/reference/warehouse-setups/snowflake-setup) -- [New `builtins` jinja context variable](builtins) -- [New `fromyaml` context method](fromyaml) -- [New `toyaml` context method](toyaml) -- [New `project_name` context variable](project_name) -- [New `dbt_version` context variable](dbt_version) -- [New `database_schemas` variable in the `on-run-end` context](on-run-end-context) +- [New Postgres connection configs](/docs/core/connect-data-platform/postgres-setup) +- [New Snowflake KeyPair auth configs](/docs/core/connect-data-platform/snowflake-setup) +- [New `builtins` jinja context variable](/reference/dbt-jinja-functions/builtins) +- [New `fromyaml` context method](/reference/dbt-jinja-functions/fromyaml) +- [New `toyaml` context method](/reference/dbt-jinja-functions/toyaml) +- [New `project_name` context variable](/reference/dbt-jinja-functions/project_name) +- [New `dbt_version` context variable](/reference/dbt-jinja-functions/dbt_version) +- [New `database_schemas` variable in the `on-run-end` context](/reference/dbt-jinja-functions/on-run-end-context) diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-17-0.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-17-0.md index 7d7f70baa14..1f891ebc0f4 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-17-0.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-17-0.md @@ -131,7 +131,7 @@ This configuration will work in dbt v0.17.0 when `config-version: 2` is used, bu Support for version 1 will be removed in a future release of dbt. -### NativeEnvironment rendering for yaml fields +### NativeEnvironment rendering for YAML fields In dbt v0.17.0, dbt enabled use of Jinja's Native Environment to render values in YML files. This Native Environment coerces string values to their @@ -142,8 +142,8 @@ string-oriented inputs, like environment variables or command line variables. 
:::danger Heads up In dbt v0.17.1, native rendering is not enabled by default. It is possible to - natively render specific values using the [`as_bool`](as_bool), - [`as_number`](as_number), and [`as_native`](as_native) filters. + natively render specific values using the [`as_bool`](/reference/dbt-jinja-functions/as_bool), + [`as_number`](/reference/dbt-jinja-functions/as_number), and [`as_native`](/reference/dbt-jinja-functions/as_native) filters. The examples below have been updated to reflect 0.17.1 functionality. @@ -246,13 +246,13 @@ BigQuery: ## New and changed documentation **Core** -- [`path:` selectors](node-selection/methods#the-path-method) -- [`--fail-fast`](commands/run#failing-fast) +- [`path:` selectors](/reference/node-selection/methods#the-path-method) +- [`--fail-fast`](/reference/commands/run#failing-fast) - [as_text Jinja filter](/reference/dbt-jinja-functions/as_text) - [accessing nodes in the `graph` object](/reference/dbt-jinja-functions/graph) -- [persist_docs](resource-configs/persist_docs) +- [persist_docs](/reference/resource-configs/persist_docs) - [source properties](reference/source-properties) -- [source overrides](resource-properties/overrides) +- [source overrides](/reference/resource-properties/overrides) **BigQuery** -- [maximum_bytes_billed](/reference/warehouse-setups/bigquery-setup#maximum-bytes-billed) +- [maximum_bytes_billed](/docs/core/connect-data-platform/bigquery-setup#maximum-bytes-billed) diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-18-0.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-18-0.md index de0e6d6cb8b..8092ad807b8 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-18-0.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-18-0.md @@ -59,25 +59,25 @@ can override schema test definitions ## New and changed documentation **Core** -- [node selection syntax](node-selection/syntax) -- 
[list (ls)](commands/list) -- [defer](defer) -- [adapter.dispatch](adapter#dispatch) -- [`asset-paths` config](asset-paths) (also updated [dbt_project.yml](reference/dbt_project.yml) and [description](description)) -- [flag for colorized logs](run#enable-or-disable-colorized-logs) -- [`full_refresh` config](full_refresh) +- [node selection syntax](/reference/node-selection/syntax) +- [list (ls)](/reference/commands/list) +- [defer](/reference/node-selection/defer) +- [adapter.dispatch](/reference/dbt-jinja-functions/adapter#dispatch) +- [`asset-paths` config](/reference/project-configs/asset-paths) (also updated [dbt_project.yml](/reference/dbt_project.yml) and [description](/reference/resource-properties/description)) +- [flag for colorized logs](/reference/commands/run#enable-or-disable-colorized-logs) +- [`full_refresh` config](/reference/resource-configs/full_refresh) **Docs** -- [project-level overviews](documentation#custom-project-level-overviews) +- [project-level overviews](/docs/collaborate/documentation#custom-project-level-overviews) **Redshift** -- [`iam_profile`](/reference/warehouse-setups/redshift-setup#specifying-an-iam-profile) +- [`iam_profile`](/docs/core/connect-data-platform/redshift-setup#specifying-an-iam-profile) **Snowflake** -- `query_tag` in [profile](/reference/warehouse-setups/snowflake-setup), [model config](snowflake-configs#query-tags) -- automatic SSO [session caching](snowflake-configs#sso-authentication) support +- `query_tag` in [profile](/docs/core/connect-data-platform/snowflake-setup), [model config](/reference/resource-configs/snowflake-configs#query-tags) +- automatic SSO [session caching](/docs/core/connect-data-platform/snowflake-setup#sso-authentication) support **BigQuery** -- [`impersonate_service_account`]/reference/warehouse-setups/bigquery-setup#service-account-impersonation) -- [`policy_tags`](bigquery-configs#policy-tags) -- [`hours_to_expiration`](bigquery-configs#controlling-table-expiration) +- 
[`impersonate_service_account`](/docs/core/connect-data-platform/bigquery-setup#service-account-impersonation) +- [`policy_tags`](/reference/resource-configs/bigquery-configs#policy-tags) +- [`hours_to_expiration`](/reference/resource-configs/bigquery-configs#controlling-table-expiration) diff --git a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-19-0.md b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-19-0.md index c5ed7feb6ce..0dd428780e0 100644 --- a/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-19-0.md +++ b/website/docs/guides/migration/versions/11-Older versions/upgrading-to-0-19-0.md @@ -23,7 +23,7 @@ See the docs below for more details. We don't expect these to require action in #### Deprecations -Removed support for `config-version: 1` of dbt_project.yml, which was deprecated in v0.17.0. Use `config-version: 2` in all projects and installed packages. Otherwise, dbt will raise an error. See docs on [config-version](config-version) and the [v0.17.0 Migration Guide](/guides/migration/versions) for details. +Removed support for `config-version: 1` of dbt_project.yml, which was deprecated in v0.17.0. Use `config-version: 2` in all projects and installed packages. Otherwise, dbt will raise an error. See docs on [config-version](/reference/project-configs/config-version) and the [v0.17.0 Migration Guide](/guides/migration/versions) for details. ### For dbt plugin maintainers @@ -42,22 +42,22 @@ See [dbt#2961](https://github.com/dbt-labs/dbt-core/pull/2961) for full implemen ## New and changed documentation ### Core -- [dbt Artifacts](dbt-artifacts): The artifacts produced by dbt—manifest, catalog, run results, and sources—are simpler to consume and more clearly documented. -- [dbt Classes](dbt-classes#result-objects), [on-run-end Context](on-run-end-context#results): The `Result` object has a new schema, in line with changes to `run_results.json`. 
-- [Statement blocks](statement-blocks): The `call statement` result `status` string is now a structured object named `response`. -- [Snapshots](snapshots#invalidate_hard_deletes): If the config `invalidate_hard_deletes` is enabled, `dbt snapshot` will update records whose unique key no longer exist in the snapshot query. Should those uniquely identified records "revive," `dbt snapshot` will re-add them. -- [YAML selectors](yaml-selectors) support a `description` property and record their expanded dictionary representations in the manifest. -- [Modules](modules): The regex python module, `re`, is available in dbt's Jinja context. -- [parse](parse): New command to parse a dbt project and write detailed timing info. +- [dbt Artifacts](/docs/deploy/artifacts): The artifacts produced by dbt—manifest, catalog, run results, and sources—are simpler to consume and more clearly documented. +- [dbt Classes](/reference/dbt-classes#result-objects), [on-run-end Context](/reference/dbt-jinja-functions/on-run-end-context#results): The `Result` object has a new schema, in line with changes to `run_results.json`. +- [Statement blocks](/reference/dbt-jinja-functions/statement-blocks): The `call statement` result `status` string is now a structured object named `response`. +- [Snapshots](/docs/build/snapshots#snapshot-configurations): If the config `invalidate_hard_deletes` is enabled, `dbt snapshot` will update records whose unique key no longer exists in the snapshot query. Should those uniquely identified records "revive," `dbt snapshot` will re-add them. +- [YAML selectors](/reference/node-selection/yaml-selectors) support a `description` property and record their expanded dictionary representations in the manifest. +- [Modules](/reference/dbt-jinja-functions/modules): The regex Python module, `re`, is available in dbt's Jinja context. +- [parse](/reference/commands/parse): New command to parse a dbt project and write detailed timing info.
#### State -- [Understanding state](understanding-state): New docs outlining the conceptual background of state-informed runs, as well as the [known caveats](state-comparison-caveats) for state comparison. In v0.19.0, dbt is a little bit smarter at identifying `state:modified` "false positives" that previously resulted from env-based configurations in `dbt_project`. -- [Defer](defer) has changed: Instead of deferring all unselected node references, dbt now defers an unselected node reference _if and only if_ it does not exist in the current environment. Tests can defer their upstream references as well. This better supports the "Slim CI" use case by addressing the current environment's resources across `seed`, `run`, and `test` steps. -- [RPC](rpc): Added `state` and `defer` as arguments to RPC methods for which it is supported on the CLI. +- [About state](/reference/node-selection/syntax#about-node-selection): New docs outlining the conceptual background of state-informed runs, as well as the [known caveats](/reference/node-selection/state-comparison-caveats) for state comparison. In v0.19.0, dbt is a little bit smarter at identifying `state:modified` "false positives" that previously resulted from env-based configurations in `dbt_project`. +- [Defer](/reference/node-selection/defer) has changed: Instead of deferring all unselected node references, dbt now defers an unselected node reference _if and only if_ it does not exist in the current environment. Tests can defer their upstream references as well. This better supports the "Slim CI" use case by addressing the current environment's resources across `seed`, `run`, and `test` steps. +- [RPC](/reference/commands/rpc): Added `state` and `defer` as arguments to RPC methods for which it is supported on the CLI. ### BigQuery -- [BigQuery profile](/reference/warehouse-setups/bigquery-setup): dbt can connect via OAuth tokens (one-time or refresh), and it can use the default project when connecting via `gcloud` oauth. 
-- [Hourly, monthly and yearly partitions](bigquery-configs#partitioning-by-a-date-or-timestamp): With a new `granularity` attribute of the `partition_by` config, dbt can materialize models as tables partitioned by hour, month, or year. +- [BigQuery profile](/docs/core/connect-data-platform/bigquery-setup): dbt can connect via OAuth tokens (one-time or refresh), and it can use the default project when connecting via `gcloud` oauth. +- [Hourly, monthly and yearly partitions](/reference/resource-configs/bigquery-configs#partitioning-by-a-date-or-timestamp): With a new `granularity` attribute of the `partition_by` config, dbt can materialize models as tables partitioned by hour, month, or year. ### Spark -- [Spark profile](/reference/warehouse-setups/spark-setup): The `thrift` and `http` connection methods require installation of a `PyHive` extra. +- [Spark profile](/docs/core/connect-data-platform/spark-setup): The `thrift` and `http` connection methods require installation of a `PyHive` extra. diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md index 86a5c669be6..d6760771b79 100644 --- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md +++ b/website/docs/guides/orchestration/airflow-and-dbt-cloud/1-airflow-and-dbt-cloud.md @@ -15,17 +15,17 @@ In some cases, [Airflow](https://airflow.apache.org/) may be the preferred orche ### Airflow + dbt Core -There are so many great examples from Gitlab through their open source data engineering work. Example: [here](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_snowplow_backfill.py). This is especially appropriate if you are well-versed in Kubernetes, CI/CD, and docker task management when building your airflow pipelines. 
If this is you and your team, you’re in good hands reading through more details: [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/infrastructure/#airflow) and [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/dbt-guide/) +There are [so many great examples](https://gitlab.com/gitlab-data/analytics/-/blob/master/dags/transformation/dbt_snowplow_backfill.py) from GitLab through their open source data engineering work. This is especially appropriate if you are well-versed in Kubernetes, CI/CD, and Docker task management when building your Airflow pipelines. If this is you and your team, you’re in good hands reading through more details [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/infrastructure/#airflow) and [here](https://about.gitlab.com/handbook/business-technology/data-team/platform/dbt-guide/). ### Airflow + dbt Cloud API w/Custom Scripts -This has served as a bridge until the fabled Astronomer + dbt Labs-built dbt Cloud provider became generally available: [here](https://registry.astronomer.io/providers/dbt-cloud?type=Sensors&utm_campaign=Monthly%20Product%20Updates&utm_medium=email&_hsmi=208603877&utm_content=208603877&utm_source=hs_email) +This has served as a bridge until the fabled Astronomer + dbt Labs-built dbt Cloud provider became generally available [here](https://registry.astronomer.io/providers/dbt%20Cloud/versions/latest).
There are many different permutations of this over time: -- [Custom Python Scripts](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_example.py): This is an airflow DAG based on custom python API utilities [here](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_utils.py) -- [Make API requests directly through the BashOperator based on the docs](https://docs.getdbt.com/dbt-cloud/api-v2#operation/triggerRun): You can make cURL requests to invoke dbt Cloud to do what you want -- [Other ways to run dbt in airflow](https://docs.getdbt.com/docs/running-a-dbt-project/running-dbt-in-production/#using-airflow): Official dbt Docs on how teams are running dbt in airflow +- [Custom Python Scripts](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_example.py): This is an Airflow DAG based on [custom Python API utilities](https://github.com/sungchun12/airflow-dbt-cloud/blob/main/archive/dbt_cloud_utils.py) +- [Make API requests directly through the BashOperator based on the docs](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#operation/triggerRun): You can make cURL requests to invoke dbt Cloud to do what you want +- For more options, check out the [official dbt Docs](/docs/deploy/deployments#airflow) on the various ways teams are running dbt in Airflow ## This guide's process @@ -48,7 +48,7 @@ While you’re learning the ropes, you’ll also gain a better understanding of ### Prerequisites -- [dbt Cloud Teams or Enterprise account](https://www.getdbt.com/pricing/) (with [admin access](https://docs.getdbt.com/docs/collaborate/manage-access/enterprise-permissions)) in order to create a service token. Permissions for service tokens can be found [here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#permissions-for-service-account-tokens).
+- [dbt Cloud Teams or Enterprise account](https://www.getdbt.com/pricing/) (with [admin access](https://docs.getdbt.com/docs/cloud/manage-access/enterprise-permissions)) in order to create a service token. Permissions for service tokens can be found [here](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#permissions-for-service-account-tokens). - A [free Docker account](https://hub.docker.com/signup) in order to sign in to Docker Desktop, which will be installed in the initial setup. - A local digital scratchpad for temporarily copy-pasting API keys and URLs diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md index ab847c526a0..9c3b8eb7f1b 100644 --- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md +++ b/website/docs/guides/orchestration/airflow-and-dbt-cloud/2-setting-up-airflow-and-dbt-cloud.md @@ -10,7 +10,7 @@ Astro is a managed software service that includes key features for teams working In this example, we’re using Homebrew to install Astro CLI. Follow the instructions to install the Astro CLI for your own operating system [here](https://docs.astronomer.io/astro/install-cli). ```bash -brew install astronomer/cloud/astrocloud +brew install astro ``` @@ -40,32 +40,32 @@ You can initialize an Astronomer project in an empty local directory using a Doc 1. Run the following commands to initialize your project and start your local Airflow deployment: - ```bash - astrocloud dev init - astrocloud dev start - ``` + ```bash + astro dev init + astro dev start + ``` - When this finishes, you should see a message similar to the following: + When this finishes, you should see a message similar to the following: - ```bash - Airflow is starting up! This might take a few minutes… + ```bash + Airflow is starting up! This might take a few minutes… - Project is running! 
All components are now available. + Project is running! All components are now available. - Airflow Webserver: http://localhost:8080 - Postgres Database: localhost:5432/postgres - The default Airflow UI credentials are: admin:admin - The default Postrgres DB credentials are: postgres:postgres - ``` + Airflow Webserver: http://localhost:8080 + Postgres Database: localhost:5432/postgres + The default Airflow UI credentials are: admin:admin + The default Postrgres DB credentials are: postgres:postgres + ``` 2. Open the Airflow interface. Launch your web browser and navigate to the address for the **Airflow Webserver** from your output in Step 1. - This will take you to your local instance of Airflow. You’ll need to log in with the **default credentials**: + This will take you to your local instance of Airflow. You’ll need to log in with the **default credentials**: - - Username: admin - - Password: admin + - Username: admin + - Password: admin - ![Airflow login screen](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-login.png) + ![Airflow login screen](/img/guides/orchestration/airflow-and-dbt-cloud/airflow-login.png) @@ -77,7 +77,7 @@ Create a service token from within dbt Cloud using the instructions [found here] ## 6. Create a dbt Cloud job -In your dbt Cloud account create a job, paying special attention to the information in the bullets below. Additional information for creating a dbt Cloud job can be found [here](https://docs.getdbt.com/docs/get-started/getting-started/building-your-first-project/schedule-a-job). +In your dbt Cloud account create a job, paying special attention to the information in the bullets below. Additional information for creating a dbt Cloud job can be found [here](/quickstarts/bigquery). - Configure the job with the commands that you want to include when this job kicks off, as Airflow will be referring to the job’s configurations for this rather than being explicitly coded in the Airflow DAG. 
This job will run a set of commands rather than a single command. - Ensure that the schedule is turned **off** since we’ll be using Airflow to kick things off. diff --git a/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md b/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md index d6cecd1457d..5766d8c0b79 100644 --- a/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md +++ b/website/docs/guides/orchestration/airflow-and-dbt-cloud/4-airflow-and-dbt-cloud-faqs.md @@ -34,7 +34,7 @@ Our friends at Astronomer answer this question with this example: [here](https:/ Check out these two resources for accomplishing your own CI/CD pipeline: -- [Continuous Integration with dbt Cloud](/docs/deploy/cloud-ci-job) +- [Continuous Integration with dbt Cloud](/docs/deploy/continuous-integration) - [Astronomer's CI/CD Example](https://docs.astronomer.io/software/ci-cd/#example-cicd-workflow) ## 6. Can dbt dynamically create tasks in the DAG like Airflow can? diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md b/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md index 4dbcbb6c6c7..a66259c6c49 100644 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md +++ b/website/docs/guides/orchestration/custom-cicd-pipelines/1-cicd-background.md @@ -1,10 +1,8 @@ --- -title: Customizing CI/CD +title: Customizing CI/CD with Custom Pipelines id: 1-cicd-background --- -# Creating Custom CI/CD Pipelines - One of the core tenets of dbt is that analytic code should be version controlled. This provides a ton of benefit to your organization in terms of collaboration, code consistency, stability, and the ability to roll back to a prior version. There’s an additional benefit that is provided with your code hosting platform that is often overlooked or underutilized. 
Some of you may have experience using dbt Cloud’s [webhook functionality](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) to run a job when a PR is created. This is a fantastic capability, and meets most use cases for testing your code before merging to production. However, there are circumstances when an organization needs additional functionality, like running workflows on every commit (linting), or running workflows after a merge is complete. In this article, we will show you how to setup custom pipelines to lint your project and trigger a dbt Cloud job via the API. A note on parlance in this article since each code hosting platform uses different terms for similar concepts. The terms `pull request` (PR) and `merge request` (MR) are used interchangeably to mean the process of merging one branch into another branch. @@ -23,9 +21,11 @@ Please note, runners hosted by your code hosting platform provide a certain amou - Repo-hosted runner billing information: - [GitHub](https://docs.github.com/en/billing/managing-billing-for-github-actions/about-billing-for-github-actions) - [GitLab](https://docs.gitlab.com/ee/ci/pipelines/cicd_minutes.html) + - [Bitbucket](https://bitbucket.org/product/features/pipelines#) - Self-hosted runner information: - [GitHub](https://docs.github.com/en/actions/hosting-your-own-runners/about-self-hosted-runners) - [GitLab](https://docs.gitlab.com/runner/) + - [Bitbucket](https://support.atlassian.com/bitbucket-cloud/docs/runners/) Additionally, if you’re using the free tier of GitLab you can still follow this guide, but it may ask you to provide a credit card to verify your account. You’ll see something like this the first time you try to run a pipeline: @@ -34,7 +34,7 @@ Additionally, if you’re using the free tier of GitLab you can still follow thi ## How to setup pipelines -This guide provides details for multiple code hosting platforms. 
Where steps are unique, they are presented without a selection option. If code is specific to a platform (i.e. GitHub, GitLab) you will see a selection option for each. +This guide provides details for multiple code hosting platforms. Where steps are unique, they are presented without a selection option. If code is specific to a platform (i.e. GitHub, GitLab, Bitbucket) you will see a selection option for each. Pipelines can be triggered by various events. The [dbt Cloud webhook](https://docs.getdbt.com/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration) process already triggers a run if you want to run your jobs on a merge request, so this guide focuses on running pipelines for every push and when PRs are merged. Since pushes happen frequently in a project, we’ll keep this job super simple and fast by linting with SQLFluff. The pipeline that runs on merge requests will run less frequently, and can be used to call the dbt Cloud API to trigger a specific job. This can be helpful if you have specific requirements that need to happen when code is updated in production, like running a `--full-refresh` on all impacted incremental models. diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md b/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md deleted file mode 100644 index 5031ea50844..00000000000 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/2-lint-on-push.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -title: Lint code on push -id: 2-lint-on-push ---- - -This section shows a very basic example of linting a project every time a commit is pushed to the repo. While it is simple, it shows the power of CI and can be expanded on to meet the needs of your organization. - -The steps below use [SQLFluff](https://docs.sqlfluff.com/en/stable/) to scan your code and look for linting errors. 
In the example, it's set to use the `snowflake` dialect, and specfically runs the rules L019, L020, L021, and L022. This is purley for demonstration purposes. You should update this to reflect your code base's [dialect](https://docs.sqlfluff.com/en/stable/dialects.html) and the [rules](https://docs.sqlfluff.com/en/stable/rules.html) you've established for your repo. - -### 1. Create a yaml file to define your pipeline - -The yaml files defined below are what tell your code hosting platform the steps to run. In this setup, you’re telling the platform to run a SQLFluff lint job every time a commit is pushed. - - - - -In order for GitHub to know that you want to run an action, you need to have a few specific folders in your project. Add a new folder named `.github`, and within that folder add a new one named `workflows`. Your final folder structure will look like this: - -```sql -my_awesome_project -├── .github -│ ├── workflows -│ │ └── lint_on_push.yml -``` - -To define the job for our action, let’s add a new file named `lint_on_push.yml` under the `workflows` folder. This file is how we tell the GitHub runner what to execute when the job is triggered. - -Below I touch on the important pieces for running a dbt Cloud job, but if you want a full run-down of all the components of this yaml file checkout [this GitHub article](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#understanding-the-workflow-file) on actions. - -**Key pieces:** - -- `on:` - this is used to filter when the pipeline is run. In this example we’re running it on every push except for pushes to branches named `main`. For more filters, checkout [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). -- `runs-on: ubuntu-latest` - this defines the operating system we’re using to run the job -- `uses:` - remember the virtual servers we coved in the background section? 
They’re just empty operating systems, so there are two pieces of setup that are needed in order to access the code in your repo, and setup Python correctly on the virtual server. These two actions are called from other repos in GitHub to provide those services. For more information on them, checkout their repos: [actions/checkout](https://github.com/actions/checkout#checkout-v3) and [actions/setup-python](https://github.com/actions/setup-python#setup-python-v3). -- `run:` - this is how we’re telling the GitHub runner to execute the Python script we defined above. - -```yaml -name: lint dbt project on push - -on: - push: - branches-ignore: - - 'main' - -jobs: -# this job runs SQLFluff with a specific set of rules - # note the dialect is set to Snowflake, so make that specific to your setup - # details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html - lint_project: - name: Run SQLFluff linter - runs-on: ubuntu-latest - - steps: - - uses: "actions/checkout@v3" - - uses: "actions/setup-python@v2" - with: - python-version: "3.9" - - name: Install SQLFluff - run: "pip install sqlfluff==0.13.1" - - name: Lint project - run: "sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022" - -``` - - - - -Create a `.gitlab-ci.yml` file in your **root directory** to define the triggers for when to execute the script above. You’ll put the code below into this file. - -```sql -my_awesome_project -├── dbt_project.yml -├── .gitlab-ci.yml -``` - -**Key pieces:** - -- `image: python:3.9` - this defines the virtual image we’re using to run the job -- `rules:` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event except when the branch is named `main`. Filters are very powerful to run commands on specific events, and you can find a full list in [GitLab’s documentation](https://docs.gitlab.com/ee/ci/yaml/#rules). 
-- `script:` - this is how we’re telling the GitLab runner to execute the Python script we defined above. - -```yaml -image: python:3.9 - -stages: - - pre-build - -# this job runs SQLFluff with a specific set of rules -# note the dialect is set to Snowflake, so make that specific to your setup -# details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html -lint-project: - stage: pre-build - rules: - - if: $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH != 'main' - script: - - pip install sqlfluff==0.13.1 - - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022 -``` - - - - -### 2. Commit and push your changes to make sure everything works - -After you finish creating the yaml files, commit and push your code. Doing this will trigger your pipeline for the first time! If everything goes well, you should see the pipeline in your code platform. When you click into the job you’ll get a log showing that SQLFluff was run. If your code failed linting you’ll get an error in the job with a description of what needs to be fixed. If everything passed the lint check, you’ll see a successful job run. 
- - - - -In your repository, click the *Actions* tab - -![Image showing the GitHub action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-github.png) - -Sample output from SQLFluff in the `Run SQLFluff linter` job: - -![Image showing the logs in GitHub for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-github.png) - - - - -In the menu option got to *CI/CD > Pipelines* - -![Image showing the GitLab action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-gitlab.png) - -Sample output from SQLFluff in the `Run SQLFluff linter` job: - -![Image showing the logs in GitLab for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-gitlab.png) - - - diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md b/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md index 042e6ea3d61..d22d1d14284 100644 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md +++ b/website/docs/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge.md @@ -1,11 +1,11 @@ --- -title: Run a dbt Cloud job on merge +title: Run a dbt Cloud job on merge id: 3-dbt-cloud-job-on-merge --- -This job will take a bit more to setup, but is a good example of how to call the dbt Cloud API from a CI/CD pipeline. The concepts persented here can be generalized and used in whatever way best suits your use case. +This job will take a bit more to setup, but is a good example of how to call the dbt Cloud API from a CI/CD pipeline. The concepts presented here can be generalized and used in whatever way best suits your use case. -The setup below shows how to call the dbt Cloud API to run a job every time there is a push to your main branch (i.e. a PR is merged). 
+The setup below shows how to call the dbt Cloud API to run a job every time there's a push to your main branch (the branch where pull requests are typically merged; it's commonly called the main, primary, or master branch, but can be named differently). ### 1. Get your dbt Cloud API key @@ -35,6 +35,8 @@ This next part will happen in you code hosting platform. We need to save your AP values={[ { label: 'GitHub', value: 'github', }, {label: 'GitLab', value: 'gitlab', }, + {label: 'Azure DevOps', value: 'ado', }, + {label: 'Bitbucket', value: 'bitbucket', }, ] }> @@ -57,6 +59,7 @@ Here’s a video showing these steps: + In GitLab: @@ -76,6 +79,45 @@ In GitLab: + + + +In Azure: + +- Open up your Azure DevOps project where you want to run the pipeline (the same one that houses your dbt project) +- Click on *Pipelines* and then *Create Pipeline* +- Select where your git code is located. It should be *Azure Repos Git* + - Select your git repository from the list +- Select *Starter pipeline* (this will be updated later in Step 4) +- Click on *Variables* and then *New variable* +- In the *Name* field, enter `DBT_API_KEY` + - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** +- In the *Value* section, paste in the key you copied from dbt Cloud +- Make sure the check box next to *Keep this value secret* is checked. This will mask the value in logs, and you won't be able to see the value for the variable in the UI.
+- Click *OK* and then *Save* to save the variable +- Save your new Azure pipeline + + + + + + +In Bitbucket: + +- Open up your repository where you want to run the pipeline (the same one that houses your dbt project) +- In the left menu, click *Repository Settings* +- Scroll to the bottom of the left menu, and select *Repository variables* +- In the *Name* field, input `DBT_API_KEY` + - **It’s very important that you copy/paste this name exactly because it’s used in the scripts below.** +- In the *Value* section, paste in the key you copied from dbt Cloud +- Make sure the check box next to *Secured* is checked. This will mask the value in logs, and you won't be able to see the value for the variable in the UI. +- Click *Add* to save the variable + + ![View of the Bitbucket window for entering DBT_API_KEY](/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-bitbucket.png) + + Here’s a video showing these steps: + + @@ -122,6 +164,8 @@ In order to call the dbt Cloud API, there are a few pieces of info the script ne values={[ { label: 'GitHub', value: 'github', }, {label: 'GitLab', value: 'gitlab', }, + {label: 'Azure DevOps', value: 'ado', }, + {label: 'Bitbucket', value: 'bitbucket', }, ] }> @@ -138,16 +182,15 @@ my_awesome_project │ │ └── lint_on_push.yml ``` -The yaml file will look pretty similar to our earlier job, but there is a new section called `env` that we’ll use to pass in the required variables. Update the variables below to match your setup based on the comments in the file. +The YAML file will look pretty similar to our earlier job, but there is a new section called `env` that we’ll use to pass in the required variables. Update the variables below to match your setup based on the comments in the file. It’s worth noting that we changed the `on:` section to now run **only** when there are pushes to a branch named `main` (i.e. a PR is merge). 
Have a look through [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows) on these filters for additional use cases. ```yaml - name: run dbt Cloud job on push # This filter says only run this job when there is a push to the main branch -# This works off the assumption that you've restrictred this branch to only all PRs to push to the deafult branch +# This works off the assumption that you've restricted this branch to only allow PRs to push to the default branch # Update the name to match the name of your default branch on: push: @@ -172,7 +215,7 @@ jobs: steps: - uses: "actions/checkout@v3" - - uses: "actions/setup-python@v2" + - uses: "actions/setup-python@v4" with: python-version: "3.9" - name: Run dbt Cloud job @@ -182,9 +225,9 @@ jobs: -For this job, we'll set it up using the `gitlab-ci.yml` file as in the prior step (see Step 1 of the linting setup for more info). The yaml file will look pretty similar to our earlier job, but there is a new section called `variables` that we’ll use to pass in the required variables to the Python script. Update this section to match your setup based on the comments in the file. +For this job, we'll set it up using the `.gitlab-ci.yml` file as in the prior step (see Step 1 of the linting setup for more info). The YAML file will look pretty similar to our earlier job, but there is a new section called `variables` that we’ll use to pass in the required variables to the Python script. Update this section to match your setup based on the comments in the file. -Please note that the `rules:` section now says to run **only** when there are pushes to a branch named `main` (i.e. a PR is merge). Have a look through [GitLab’s docs](https://docs.gitlab.com/ee/ci/yaml/#rules) on these filters for additional use cases. +Please note that the `rules:` section now says to run **only** when there are pushes to a branch named `main`, such as a PR being merged.
Have a look through [GitLab’s docs](https://docs.gitlab.com/ee/ci/yaml/#rules) on these filters for additional use cases. + + + +For this new job, open the existing Azure pipeline you created above and select the *Edit* button. We'll want to edit the corresponding Azure pipeline YAML file with the appropriate configuration, instead of the starter code, along with including a `variables` section to pass in the required variables. + +Copy the below YAML file into your Azure pipeline and update the variables below to match your setup based on the comments in the file. It's worth noting that we changed the `trigger` section so that it will run **only** when there are pushes to a branch named `main` (like a PR merged to your main branch). + +Read through [Azure's docs](https://learn.microsoft.com/en-us/azure/devops/pipelines/build/triggers?view=azure-devops) on these filters for additional use cases. + +```yaml +name: Run dbt Cloud Job + +trigger: [ main ] # runs on pushes to main + +variables: + DBT_URL: https://cloud.getdbt.com # no trailing slash, adjust this accordingly for single-tenant deployments + DBT_JOB_CAUSE: 'Azure Pipeline CI Job' # provide a descriptive job cause here for easier debugging down the road + DBT_ACCOUNT_ID: 00000 # enter your account id + DBT_PROJECT_ID: 00000 # enter your project id + DBT_PR_JOB_ID: 00000 # enter your job id + +steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '3.7' + displayName: 'Use Python 3.7' + + - script: | + pip install requests + displayName: 'Install python dependencies' + + - script: | + python -u ./python/run_and_monitor_dbt_job.py + displayName: 'Run dbt job ' + env: + DBT_API_KEY: $(DBT_API_KEY) # Set these values as secrets in the Azure pipelines Web UI +``` + + + + +For this job, we'll set it up using the `bitbucket-pipelines.yml` file as in the prior step (see Step 1 of the linting setup for more info).
The YAML file will look pretty similar to our earlier job, but we’ll pass in the required variables to the Python script using `export` statements. Update this section to match your setup based on the comments in the file. + + + + +```yaml +image: python:3.11.1 + + +pipelines: + branches: + 'main': # override if your default branch doesn't run on a branch named "main" + - step: + name: 'Run dbt Cloud Job' + script: + - export DBT_URL="https://cloud.getdbt.com" # if you have a single-tenant deployment, adjust this accordingly + - export DBT_JOB_CAUSE="Bitbucket Pipeline CI Job" + - export DBT_ACCOUNT_ID=00000 # enter your account id here + - export DBT_PROJECT_ID=00000 # enter your project id here + - export DBT_PR_JOB_ID=00000 # enter your job id here + - python python/run_and_monitor_dbt_job.py +``` + + + + +```yaml +image: python:3.11.1 + + +pipelines: + branches: + '**': # this sets a wildcard to run on every branch unless specified by name below + - step: + name: Lint dbt project + script: + - pip install sqlfluff==0.13.1 + - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022 + + 'main': # override if your default branch doesn't run on a branch named "main" + - step: + name: 'Run dbt Cloud Job' + script: + - export DBT_URL="https://cloud.getdbt.com" # if you have a single-tenant deployment, adjust this accordingly + - export DBT_JOB_CAUSE="Bitbucket Pipeline CI Job" + - export DBT_ACCOUNT_ID=00000 # enter your account id here + - export DBT_PROJECT_ID=00000 # enter your project id here + - export DBT_PR_JOB_ID=00000 # enter your job id here + - python python/run_and_monitor_dbt_job.py +``` + + + + + @@ -276,6 +421,8 @@ Additionally, you’ll see the job in the run history of dbt Cloud. 
It should be values={[ { label: 'GitHub', value: 'github', }, {label: 'GitLab', value: 'gitlab', }, + {label: 'Azure DevOps', value: 'ado', }, + {label: 'Bitbucket', value: 'bitbucket', }, ] }> @@ -287,9 +434,23 @@ Additionally, you’ll see the job in the run history of dbt Cloud. It should be -![dbt run on merge job in GitLub](/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-gitlab.png) +![dbt run on merge job in GitLab](/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-gitlab.png) + +![dbt Cloud job showing it was triggered by GitLab](/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-gitlab-triggered.png) + + + + + + + + + + + +![dbt run on merge job in Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-bitbucket.png) -![dbt Cloud job showing it was triggered by GitLub](/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-gitlab-triggered.png) +![dbt Cloud job showing it was triggered by Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-bitbucket-triggered.png) diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md b/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md new file mode 100644 index 00000000000..1a75fdc17ac --- /dev/null +++ b/website/docs/guides/orchestration/custom-cicd-pipelines/4-dbt-cloud-job-on-pr.md @@ -0,0 +1,131 @@ +--- +title: Run a dbt Cloud job on pull request +id: 4-dbt-cloud-job-on-pr +--- + +:::info Run on PR + +If your git provider has a native integration with dbt Cloud, you can take advantage of the setup instructions [here](/docs/deploy/ci-jobs). +This section is only for those projects that connect to their git repository using an SSH key. + +::: + +If your git provider is not one with a native integration with dbt Cloud, but you still want to take advantage of CI builds, you've come to the right spot! 
With just a bit of work it's possible to set up a job that will run a dbt Cloud job when a pull request (PR) is created. + +The setup for this pipeline will use the same steps as the prior page. Before moving on, **follow steps 1-5 from the [prior page](https://docs.getdbt.com/guides/orchestration/custom-cicd-pipelines/3-dbt-cloud-job-on-merge)** + +### 6. Create a pipeline job that runs when PRs are created + + + +For this job, we'll set it up using the `bitbucket-pipelines.yml` file as in the prior step. The YAML file will look pretty similar to our earlier job, but we’ll pass in the required variables to the Python script using `export` statements. Update this section to match your setup based on the comments in the file. + +**What is this pipeline going to do?** +The setup below will trigger a dbt Cloud job to run every time a PR is opened in this repository. It will also run a fresh version of the pipeline for every commit that is made on the PR until it is merged. +For example: If you open a PR, it will run the pipeline. If you then decide additional changes are needed, and commit/push to the PR branch, a new pipeline will run with the updated code. + +The following variables control this job: + - `DBT_JOB_BRANCH`: Tells the dbt Cloud job to run the code in the branch that created this PR + - `DBT_JOB_SCHEMA_OVERRIDE`: Tells the dbt Cloud job to run this into a custom target schema + - The format of this will look like: `DBT_CLOUD_PR_{REPO_KEY}_{PR_NUMBER}` + + +```yaml +image: python:3.11.1 + + +pipelines: + # This job will run when pull requests are created in the repository + pull-requests: + '**': + - step: + name: 'Run dbt Cloud PR Job' + script: + # Check to only build if PR destination is main (or other branch). + # Comment or remove line below if you want to run on all PRs regardless of destination branch.
+ - if [ "${BITBUCKET_PR_DESTINATION_BRANCH}" != "main" ]; then printf 'PR Destination is not master, exiting.'; exit; fi + - export DBT_URL="https://cloud.getdbt.com" + - export DBT_JOB_CAUSE="Bitbucket Pipeline CI Job" + - export DBT_JOB_BRANCH=$BITBUCKET_BRANCH + - export DBT_JOB_SCHEMA_OVERRIDE="DBT_CLOUD_PR_"$BITBUCKET_PROJECT_KEY"_"$BITBUCKET_PR_ID + - export DBT_ACCOUNT_ID=00000 # enter your account id here + - export DBT_PROJECT_ID=00000 # enter your project id here + - export DBT_PR_JOB_ID=00000 # enter your job id here + - python python/run_and_monitor_dbt_job.py +``` + + + + +### 7. Confirm the pipeline runs + +Now that you have a new pipeline, it's time to run it and make sure it works. Since this only triggers when a PR is created, you'll need to create a new PR on a branch that contains the code above. Once you do that, you should see a pipeline that looks like this: + + + + +Bitbucket pipeline: +![dbt run on PR job in Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-run-on-pr.png) + +dbt Cloud job: +![dbt Cloud job showing it was triggered by Bitbucket](/img/guides/orchestration/custom-cicd-pipelines/bitbucket-dbt-cloud-pr.png) + + + + +### 8. Handle those extra schemas in your database + +As noted above, when the PR job runs it will create a new schema based on the PR. To avoid having your database overwhelmed with PR schemas, consider adding a "cleanup" job to your dbt Cloud account. This job can run on a scheduled basis to cleanup any PR schemas that haven't been updated/used recently. + +Add this as a macro to your project. 
It takes 2 arguments that let you control which schemas get dropped: + - `age_in_days`: The number of days since the schema was last altered before it should be dropped (default 10 days) + - `database_to_clean`: The name of the database to remove schemas from + +```sql +{# + This macro finds PR schemas older than a set date and drops them + The macro defaults to 10 days old, but can be configured with the input argument age_in_days + Sample usage with different date: + dbt run-operation pr_schema_cleanup --args "{'database_to_clean': 'analytics','age_in_days':'15'}" +#} +{% macro pr_schema_cleanup(database_to_clean, age_in_days=10) %} + + {% set find_old_schemas %} + select + 'drop schema {{ database_to_clean }}.'||schema_name||';' + from {{ database_to_clean }}.information_schema.schemata + where + catalog_name = '{{ database_to_clean | upper }}' + and schema_name ilike 'DBT_CLOUD_PR%' + and last_altered <= (current_date() - interval '{{ age_in_days }} days') + {% endset %} + + {% if execute %} + + {{ log('Schema drop statements:' ,True) }} + + {% set schema_drop_list = run_query(find_old_schemas).columns[0].values() %} + + {% for schema_to_drop in schema_drop_list %} + {% do run_query(schema_to_drop) %} + {{ log(schema_to_drop ,True) }} + {% endfor %} + + {% endif %} + +{% endmacro %} +``` + +This macro goes into a dbt Cloud job that is run on a schedule.
The command will look like this (text below for copy/paste): +![dbt Cloud job showing the run operation command for the cleanup macro](/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png) +`dbt run-operation pr_schema_cleanup --args "{ 'database_to_clean': 'development','age_in_days':15}"` diff --git a/website/docs/guides/orchestration/custom-cicd-pipelines/4-something-to-consider.md b/website/docs/guides/orchestration/custom-cicd-pipelines/5-something-to-consider.md similarity index 97% rename from website/docs/guides/orchestration/custom-cicd-pipelines/4-something-to-consider.md rename to website/docs/guides/orchestration/custom-cicd-pipelines/5-something-to-consider.md index 180ab35d44f..6b39c5ce405 100644 --- a/website/docs/guides/orchestration/custom-cicd-pipelines/4-something-to-consider.md +++ b/website/docs/guides/orchestration/custom-cicd-pipelines/5-something-to-consider.md @@ -1,6 +1,6 @@ --- title: Something to Consider -id: 4-something-to-consider +id: 5-something-to-consider --- Running dbt Cloud jobs through a CI/CD pipeline is a form of job orchestration. If you also run jobs using dbt Cloud’s built in scheduler, you now have 2 orchestration tools running jobs. The risk with this is that you could run into conflicts - you can imagine a case where you are triggering a pipeline on certain actions and running scheduled jobs in dbt Cloud, you would probably run into job clashes. The more tools you have, the more you have to make sure everything talks to each other. 
diff --git a/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md b/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md new file mode 100644 index 00000000000..bb1045b3d2f --- /dev/null +++ b/website/docs/guides/orchestration/how-to-use-databricks-workflows-to-run-dbt-cloud-jobs.md @@ -0,0 +1,211 @@ +--- +title: "Use Databricks workflows to run dbt Cloud jobs" +id: how-to-use-databricks-workflows-to-run-dbt-cloud-jobs +description: Learn how to use Databricks workflows to run dbt Cloud jobs +displayText: "Use Databricks workflows to run dbt Cloud jobs" +hoverSnippet: Learn how to use Databricks workflows to run dbt Cloud jobs +--- + +Using Databricks workflows to call the dbt Cloud job API can be useful for several reasons: + +1. **Integration with other ETL processes** — If you're already running other ETL processes in Databricks, you can use a Databricks workflow to trigger a dbt Cloud job after those processes are done. +2. **Utilizes dbt Cloud jobs features —** dbt Cloud gives the ability to monitor job progress, manage historical logs and documentation, optimize model timing, and much [more](/docs/deploy/deploy-jobs). +3. [**Separation of concerns —**](https://en.wikipedia.org/wiki/Separation_of_concerns) Detailed logs for dbt jobs in the dbt Cloud environment can lead to more modularity and efficient debugging. By doing so, it becomes easier to isolate bugs quickly while still being able to see the overall status in Databricks. +4. **Custom job triggering —** Use a Databricks workflow to trigger dbt Cloud jobs based on custom conditions or logic that aren't natively supported by dbt Cloud's scheduling feature. This can give you more flexibility in terms of when and how your dbt Cloud jobs run. 
+ +## Prerequisites + +- Active [Teams or Enterprise dbt Cloud account](https://www.getdbt.com/pricing/) +- You must have a configured and existing [dbt Cloud deploy job](/docs/deploy/deploy-jobs) +- Active Databricks account with access to [Data Science and Engineering workspace](https://docs.databricks.com/workspace-index.html) and [Manage secrets](https://docs.databricks.com/security/secrets/index.html) +- [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html) + - **Note**: You only need to set up your authentication. Once you have set up your Host and Token and are able to run `databricks workspace ls /Users/`, you can proceed with the rest of this guide. + +## Configure Databricks workflows for dbt Cloud jobs + +To use Databricks workflows for running dbt Cloud jobs, you need to perform the following steps: + +- [Set up a Databricks secret scope](#set-up-a-databricks-secret-scope) +- [Create a Databricks Python notebook](#create-a-databricks-python-notebook) +- [Configure the workflows to run the dbt Cloud jobs](#configure-the-workflows-to-run-the-dbt-cloud-jobs) + +### Set up a Databricks secret scope + +1. Retrieve a **[User API Token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens#user-api-tokens)** or **[Service Account Token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens#generating-service-account-tokens)** from dbt Cloud +2. Set up a **Databricks secret scope**, which is used to securely store your dbt Cloud API key. + +3. Enter the **following commands** in your terminal: + +```bash +# In this example we set up a secret scope and key called "dbt-cloud" and "api-key" respectively. +databricks secrets create-scope --scope <YOUR_SECRET_SCOPE> +databricks secrets put --scope <YOUR_SECRET_SCOPE> --key <YOUR_SECRET_KEY> --string-value "<YOUR_DBT_CLOUD_API_KEY>" +``` + +4. Replace **`<YOUR_SECRET_SCOPE>`** and **`<YOUR_SECRET_KEY>`** with your own unique identifiers. Click [here](https://docs.databricks.com/security/secrets/index.html) for more information on secrets. + +5.
Replace **``** with the actual API key value that you copied from dbt Cloud in step 1. + + +### Create a Databricks Python notebook + +1. [Create a **Databricks Python notebook**](https://docs.databricks.com/notebooks/notebooks-manage.html), which executes a Python script that calls the dbt Cloud job API. + +2. Write a **Python script** that utilizes the `requests` library to make an HTTP POST request to the dbt Cloud job API endpoint using the required parameters. Here's an example script: + +```python +import enum +import os +import time +import json +import requests +from getpass import getpass + +dbutils.widgets.text("job_id", "Enter the Job ID") +job_id = dbutils.widgets.get("job_id") + +account_id = +base_url = "" +api_key = dbutils.secrets.get(scope = "", key = "") + +# These are documented on the dbt Cloud API docs +class DbtJobRunStatus(enum.IntEnum): + QUEUED = 1 + STARTING = 2 + RUNNING = 3 + SUCCESS = 10 + ERROR = 20 + CANCELLED = 30 + +def _trigger_job() -> int: + res = requests.post( + url=f"https://{base_url}/api/v2/accounts/{account_id}/jobs/{job_id}/run/", + headers={'Authorization': f"Token {api_key}"}, + json={ + # Optionally pass a description that can be viewed within the dbt Cloud API. 
+ # See the API docs for additional parameters that can be passed in, + # including `schema_override` + 'cause': f"Triggered by Databricks Workflows.", + } + ) + + try: + res.raise_for_status() + except: + print(f"API token (last four): ...{api_key[-4:]}") + raise + + response_payload = res.json() + return response_payload['data']['id'] + +def _get_job_run_status(job_run_id): + res = requests.get( + url=f"https://{base_url}/api/v2/accounts/{account_id}/runs/{job_run_id}/", + headers={'Authorization': f"Token {api_key}"}, + ) + + res.raise_for_status() + response_payload = res.json() + return response_payload['data']['status'] + +def run(): + job_run_id = _trigger_job() + print(f"job_run_id = {job_run_id}") + while True: + time.sleep(5) + status = _get_job_run_status(job_run_id) + print(DbtJobRunStatus(status)) + if status == DbtJobRunStatus.SUCCESS: + break + elif status == DbtJobRunStatus.ERROR or status == DbtJobRunStatus.CANCELLED: + raise Exception("Failure!") + +if __name__ == '__main__': + run() +``` + +3. Replace **``** and **``** with the values you used [previously](#set-up-a-databricks-secret-scope) + +4. Replace **``** and **``** with the correct values of your environment and [Access URL](/docs/cloud/about-cloud/regions-ip-addresses) for your region and plan. + +:::tip + To find these values, navigate to **dbt Cloud**, select **Deploy -> Jobs**. Select the Job you want to run and copy the URL. For example: `https://cloud.getdbt.com/deploy/000000/projects/111111/jobs/222222` +and therefore valid code would be: + + # Your URL is structured https:///deploy//projects//jobs/ + account_id = 000000 + job_id = 222222 + base_url = "cloud.getdbt.com" +::: + +5. Run the Notebook. It will fail, but you should see **a `job_id` widget** at the top of your notebook. + +6. In the widget, **enter your `job_id`** from step 4. + +7. **Run the Notebook again** to trigger the dbt Cloud job. 
Your results should look similar to the following: + +```bash +job_run_id = 123456 +DbtJobRunStatus.QUEUED +DbtJobRunStatus.QUEUED +DbtJobRunStatus.QUEUED +DbtJobRunStatus.STARTING +DbtJobRunStatus.RUNNING +DbtJobRunStatus.RUNNING +DbtJobRunStatus.RUNNING +DbtJobRunStatus.RUNNING +DbtJobRunStatus.RUNNING +DbtJobRunStatus.RUNNING +DbtJobRunStatus.RUNNING +DbtJobRunStatus.RUNNING +DbtJobRunStatus.SUCCESS +``` + +:::note +You can cancel the job from dbt Cloud if necessary. +::: + +### Configure the workflows to run the dbt Cloud jobs + +You can set up workflows directly from the notebook OR by adding this notebook to one of your existing workflows: + + + + + +1. Click **Schedule** on the upper right side of the page +2. Click **Add a schedule** +3. Configure Job name, Schedule, Cluster +4. Add a new parameter called: `job_id` and fill in your job ID. Refer to [step 4 in previous section](#create-a-databricks-python-notebook) to find your job ID. +5. Click **Create** +6. Click **Run Now** to test the job + + + + + +1. Open Existing **Workflow** +2. Click **Tasks** +3. Press **“+” icon** to add a new task +4. Enter the **following**: + +| Field | Value | +|---|---| +| Task name | `` | +| Type | Notebook | +| Source | Workspace | +| Path | `` | +| Cluster | `` | +| Parameters | `job_id`: `` | + +5. Select **Save Task** +6. Click **Run Now** to test the workflow + + + + +Multiple Workflow tasks can be set up using the same notebook by configuring the `job_id` parameter to point to different dbt Cloud jobs. + +## Closing + +Using Databricks workflows to access the dbt Cloud job API can improve integration of your data pipeline processes and enable scheduling of more complex workflows. 
diff --git a/website/docs/guides/orchestration/set-up-ci/1-introduction.md b/website/docs/guides/orchestration/set-up-ci/1-introduction.md new file mode 100644 index 00000000000..97df16b4ce1 --- /dev/null +++ b/website/docs/guides/orchestration/set-up-ci/1-introduction.md @@ -0,0 +1,10 @@ +--- +title: "Get started with Continuous Integration tests" +slug: overview +--- + +By validating your code _before_ it goes into production, you don't need to spend your afternoon fielding messages from people whose reports are suddenly broken. + +A solid CI setup is critical to preventing avoidable downtime and broken trust. dbt Cloud uses **sensible defaults** to get you up and running in a performant and cost-effective way in minimal time. + +After that, there's time to get fancy, but let's walk before we run. diff --git a/website/docs/guides/orchestration/set-up-ci/2-quick-setup.md b/website/docs/guides/orchestration/set-up-ci/2-quick-setup.md new file mode 100644 index 00000000000..9b6d46fe2b2 --- /dev/null +++ b/website/docs/guides/orchestration/set-up-ci/2-quick-setup.md @@ -0,0 +1,50 @@ +--- +title: "Baseline: Enable CI in 15 minutes" +slug: in-15-minutes +description: Find issues before they are deployed to production with dbt Cloud's Slim CI. +--- + +In this guide, we're going to add a **CI environment**, where proposed changes can be validated in the context of the entire project without impacting production systems. We will use a single set of deployment credentials (like the Prod environment), but models are built in a separate location to avoid impacting others (like the Dev environment). + +Your git flow will look like this: + + +## Prerequisites + +As part of your initial dbt Cloud setup, you should already have Development and Production environments configured. Let's recap what each does: + +- Your **Development environment** powers the IDE. Each user has individual credentials, and builds into an individual dev schema. 
Nothing you do here impacts any of your colleagues. +- Your **Production environment** brings the canonical version of your project to life for downstream consumers. There is a single set of deployment credentials, and everything is built into your production schema(s). + +## Step 1: Create a new CI environment + +See [Create a new environment](/docs/dbt-cloud-environments#create-a-deployment-environment). The environment should be called **CI**. Just like your existing Production environment, it will be a Deployment-type environment. + +When setting a Schema in the **Deployment Credentials** area, remember that dbt Cloud will automatically generate a custom schema name for each PR to ensure that they don't interfere with your deployed models. This means you can safely set the same Schema name as your Production job. + +## Step 2: Double-check your Production environment is identified + +Go into your existing Production environment, and ensure that the **Set as Production environment** checkbox is set. It'll make things easier later. + +## Step 3: Create a new job in the CI environment + +Use the **Continuous Integration Job** template, and call the job **CI Check**. + +In the Execution Settings, your command will be preset to `dbt build --select state:modified+`. Let's break this down: + +- [`dbt build`](/reference/commands/build) runs all nodes (seeds, models, snapshots, tests) at once in DAG order. If something fails, nodes that depend on it will be skipped. +- The [`state:modified+` selector](/reference/node-selection/methods#the-state-method) means that only modified nodes and their children will be run ("Slim CI"). In addition to [not wasting time](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603) building and testing nodes that weren't changed in the first place, this significantly reduces compute costs. + +To be able to find modified nodes, dbt needs to have something to compare against. 
dbt Cloud uses the last successful run of any job in your Production environment as its [comparison state](/reference/node-selection/syntax#about-node-selection). As long as you identified your Production environment in Step 2, you won't need to touch this. If you didn't, pick the right environment from the dropdown. + +## Step 4: Test your process + +That's it! There are other steps you can take to be even more confident in your work, such as [validating your structure follows best practices](/guides/orchestration/set-up-ci/run-dbt-project-evaluator) and [linting your code](/guides/orchestration/set-up-ci/lint-on-push), but this covers the most critical checks. + +To test your new flow, create a new branch in the dbt Cloud IDE then add a new file or modify an existing one. Commit it, then create a new Pull Request (not a draft). Within a few seconds, you’ll see a new check appear in your git provider. + +## Things to keep in mind + +- If you make a new commit while a CI run based on older code is in progress, it will be automatically canceled and replaced with the fresh code. +- An unlimited number of CI jobs can run at once. If 10 developers all commit code to different PRs at the same time, each person will get their own schema containing their changes. Once each PR is merged, dbt Cloud will drop that schema. +- CI jobs will never block a production run. diff --git a/website/docs/guides/orchestration/set-up-ci/3-run-dbt-project-evaluator.md b/website/docs/guides/orchestration/set-up-ci/3-run-dbt-project-evaluator.md new file mode 100644 index 00000000000..646a9cb42b7 --- /dev/null +++ b/website/docs/guides/orchestration/set-up-ci/3-run-dbt-project-evaluator.md @@ -0,0 +1,46 @@ +--- +title: "Enforce best practices with dbt project evaluator" +slug: run-dbt-project-evaluator +description: dbt Project Evaluator can be run from inside of your existing dbt Cloud CI job to identify common flaws in projects. 
+--- + +dbt Project Evaluator is a package designed to identify deviations from best practices common to many dbt projects, including modeling, testing, documentation, structure and performance problems. For an introduction to the package, read its [launch blog post](/blog/align-with-dbt-project-evaluator). + +## Step 1: Install the package + +As with all packages, add a reference to `dbt-labs/dbt_project_evaluator` to your `packages.yml` file. See the [dbt Package Hub](https://hub.getdbt.com/dbt-labs/dbt_project_evaluator/latest/) for full installation instructions. + +## Step 2: Define test severity with an environment variable + +As noted in the [documentation](https://dbt-labs.github.io/dbt-project-evaluator/latest/ci-check/), tests in the package are set to `warn` severity by default. + +To have these tests fail in CI, create a new environment variable called `DBT_PROJECT_EVALUATOR_SEVERITY`. Set the project-wide default to `warn`, and set it to `error` in the CI environment. + +In your `dbt_project.yml` file, override the severity configuration: + +```yaml +tests: +dbt_project_evaluator: + +severity: "{{ env_var('DBT_PROJECT_EVALUATOR_SEVERITY', 'warn') }}" +``` + +## Step 3: Update your CI commands + +Because these tests should only run after the rest of your project has been built, your existing CI command will need to be updated to exclude the dbt_project_evaluator package. You will then add a second step which builds _only_ the package's models and tests. + +Update your steps to: + +```bash +dbt build --select state:modified+ --exclude package:dbt_project_evaluator +dbt build --select package:dbt_project_evaluator +``` + +## Step 4: Apply any customizations + +Depending on the state of your project when you roll out the evaluator, you may need to skip some tests or allow exceptions for some areas.
To do this, refer to the documentation on: + +- [disabling tests](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/customization/) +- [excluding groups of models from a specific test](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/exceptions/) +- [excluding packages or sources/models based on path](https://dbt-labs.github.io/dbt-project-evaluator/latest/customization/excluding-packages-and-paths/) + +If you create a seed to exclude groups of models from a specific test, remember to disable the default seed and include `dbt_project_evaluator_exceptions` in your second `dbt build` command above. diff --git a/website/docs/guides/orchestration/set-up-ci/4-lint-on-push.md b/website/docs/guides/orchestration/set-up-ci/4-lint-on-push.md new file mode 100644 index 00000000000..1932ffe1019 --- /dev/null +++ b/website/docs/guides/orchestration/set-up-ci/4-lint-on-push.md @@ -0,0 +1,190 @@ +--- +title: "Run linting checks with SQLFluff" +slug: lint-on-push +description: Enforce your organization's SQL style guide by running SQLFluff in your git workflow whenever new code is pushed. +--- + +By [linting](/docs/cloud/dbt-cloud-ide/lint-format#lint) your project during CI, you can ensure that code styling standards are consistently enforced, without spending human time nitpicking comma placement. + +The steps below create an action/pipeline which uses [SQLFluff](https://docs.sqlfluff.com/en/stable/) to scan your code and look for linting errors. If you don't already have SQLFluff rules defined, check out [our recommended config file](/guides/best-practices/how-we-style/2-how-we-style-our-sql). + +### 1. Create a YAML file to define your pipeline + +The YAML files defined below are what tell your code hosting platform the steps to run. In this setup, you’re telling the platform to run a SQLFluff lint job every time a commit is pushed. + + + + +GitHub Actions are defined in the `.github/workflows` directory.
To define the job for your action, add a new file named `lint_on_push.yml` under the `workflows` folder. Your final folder structure will look like this: + +```sql +my_awesome_project +├── .github +│ ├── workflows +│ │ └── lint_on_push.yml +``` + +**Key pieces:** + +- `on:` defines when the pipeline is run. This workflow will run whenever code is pushed to any branch except `main`. For other trigger options, check out [GitHub’s docs](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). +- `runs-on: ubuntu-latest` - this defines the operating system we’re using to run the job +- `uses:` - When the Ubuntu server is created, it is completely empty. [`checkout`](https://github.com/actions/checkout#checkout-v3) and [`setup-python`](https://github.com/actions/setup-python#setup-python-v3) are public GitHub Actions which enable the server to access the code in your repo, and set up Python correctly. +- `run:` - these steps are run at the command line, as though you typed them at a prompt yourself. This will install sqlfluff and lint the project. Be sure to set the correct `--dialect` for your project. + +For a full breakdown of the properties in a workflow file, see [Understanding the workflow file](https://docs.github.com/en/actions/learn-github-actions/understanding-github-actions#understanding-the-workflow-file) on GitHub's website. 
+ +```yaml +name: lint dbt project on push + +on: + push: + branches-ignore: + - 'main' + +jobs: + # this job runs SQLFluff with a specific set of rules + # note the dialect is set to Snowflake, so make that specific to your setup + # details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html + lint_project: + name: Run SQLFluff linter + runs-on: ubuntu-latest + + steps: + - uses: "actions/checkout@v3" + - uses: "actions/setup-python@v4" + with: + python-version: "3.9" + - name: Install SQLFluff + run: "pip install sqlfluff" + - name: Lint project + run: "sqlfluff lint models --dialect snowflake" + +``` + + + + +Create a `.gitlab-ci.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file. + +```sql +my_awesome_project +├── dbt_project.yml +├── .gitlab-ci.yml +``` + +**Key pieces:** + +- `image: python:3.9` - this defines the virtual image we’re using to run the job +- `rules:` - defines when the pipeline is run. This workflow will run whenever code is pushed to any branch except `main`. For other rules, refer to [GitLab’s documentation](https://docs.gitlab.com/ee/ci/yaml/#rules). +- `script:` - this is how we’re telling the GitLab runner to execute the Python script we defined above. + +```yaml +image: python:3.9 + +stages: + - pre-build + +# this job runs SQLFluff with a specific set of rules +# note the dialect is set to Snowflake, so make that specific to your setup +# details on linter rules: https://docs.sqlfluff.com/en/stable/rules.html +lint-project: + stage: pre-build + rules: + - if: $CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_BRANCH != 'main' + script: + - pip install sqlfluff + - sqlfluff lint models --dialect snowflake +``` + + + + +Create a `bitbucket-pipelines.yml` file in your **root directory** to define the triggers for when to execute the script below. You’ll put the code below into this file. 
+ +```sql +my_awesome_project +├── bitbucket-pipelines.yml +├── dbt_project.yml +``` + +**Key pieces:** + +- `image: python:3.11.1` - this defines the virtual image we’re using to run the job +- `'**':` - this is used to filter when the pipeline runs. In this case we’re telling it to run on every push event, and you can see at line 12 we're creating a dummy pipeline for `main`. More information on filtering when a pipeline is run can be found in [Bitbucket's documentation](https://support.atlassian.com/bitbucket-cloud/docs/pipeline-triggers/) +- `script:` - this is how we’re telling the Bitbucket runner to execute the Python script we defined above. + +```yaml +image: python:3.11.1 + + +pipelines: + branches: + '**': # this sets a wildcard to run on every branch + - step: + name: Lint dbt project + script: + - pip install sqlfluff==0.13.1 + - sqlfluff lint models --dialect snowflake --rules L019,L020,L021,L022 + + 'main': # override if your default branch doesn't run on a branch named "main" + - step: + script: + - python --version +``` + + + + +### 2. Commit and push your changes to make sure everything works + +After you finish creating the YAML files, commit and push your code to trigger your pipeline for the first time. If everything goes well, you should see the pipeline in your code platform. When you click into the job you’ll get a log showing that SQLFluff was run. If your code failed linting you’ll get an error in the job with a description of what needs to be fixed. If everything passed the lint check, you’ll see a successful job run. 
+ + + + +In your repository, click the *Actions* tab + +![Image showing the GitHub action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-github.png) + +Sample output from SQLFluff in the `Run SQLFluff linter` job: + +![Image showing the logs in GitHub for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-github.png) + + + + +In the menu option go to *CI/CD > Pipelines* + +![Image showing the GitLab action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-gitlab.png) + +Sample output from SQLFluff in the `Run SQLFluff linter` job: + +![Image showing the logs in GitLab for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-gitlab.png) + + + + +In the left menu pane, click on *Pipelines* + +![Image showing the Bitbucket action for lint on push](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-bitbucket.png) + +Sample output from SQLFluff in the `Run SQLFluff linter` job: + +![Image showing the logs in Bitbucket for the SQLFluff run](/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-bitbucket.png) + + + diff --git a/website/docs/guides/orchestration/set-up-ci/5-multiple-checks.md b/website/docs/guides/orchestration/set-up-ci/5-multiple-checks.md new file mode 100644 index 00000000000..4bfe2d936d4 --- /dev/null +++ b/website/docs/guides/orchestration/set-up-ci/5-multiple-checks.md @@ -0,0 +1,62 @@ +--- +title: "Advanced: Create a release train with additional environments" +slug: multiple-environments +description: Large and complex enterprises sometimes require additional layers of validation before deployment. Learn how to add these checks with dbt Cloud. +--- + +:::caution Are you sure you need this? +This approach can increase release safety, but creates additional manual steps in the deployment process as well as a greater maintenance burden. 
+ +As such, it may slow down the time it takes to get new features into production. + +The team at Sunrun maintained a SOX-compliant deployment in dbt while reducing the number of environments. Check out [their Coalesce presentation](https://www.youtube.com/watch?v=vmBAO2XN-fM) to learn more. +::: + +In this section, we will add a new **QA** environment. New features will branch off from and be merged back into the associated `qa` branch, and a member of your team (the "Release Manager") will create a PR against `main` to be validated in the CI environment before going live. + +The git flow will look like this: + + +## Prerequisites + +- You have the **Development**, **CI**, and **Production** environments, as described in [the Baseline setup](/guides/orchestration/set-up-ci/in-15-minutes). + + +## Step 1: Create a `release` branch in your git repo + +As noted above, this branch will outlive any individual feature, and will be the base of all feature development for a period of time. Your team might choose to create a new branch for each sprint (`qa/sprint-01`, `qa/sprint-02`, etc), tie it to a version of your data product (`qa/1.0`, `qa/1.1`), or just have a single `qa` branch which remains active indefinitely. + +## Step 2: Update your Development environment to use the `qa` branch + +See [Custom branch behavior](/docs/dbt-cloud-environments#custom-branch-behavior). Setting `qa` as your custom branch ensures that the IDE creates new branches and PRs with the correct target, instead of using `main`. + + + +## Step 3: Create a new QA environment + +See [Create a new environment](/docs/dbt-cloud-environments#create-a-deployment-environment). The environment should be called **QA**. Just like your existing Production and CI environments, it will be a Deployment-type environment. + +Set its branch to `qa` as well. + +## Step 4: Create a new job + +Use the **Continuous Integration Job** template, and call the job **QA Check**. 
+ +In the Execution Settings, your command will be preset to `dbt build --select state:modified+`. Let's break this down: + +- [`dbt build`](/reference/commands/build) runs all nodes (seeds, models, snapshots, tests) at once in DAG order. If something fails, nodes that depend on it will be skipped. +- The [`state:modified+` selector](/reference/node-selection/methods#the-state-method) means that only modified nodes and their children will be run ("Slim CI"). In addition to [not wasting time](https://discourse.getdbt.com/t/how-we-sped-up-our-ci-runs-by-10x-using-slim-ci/2603) building and testing nodes that weren't changed in the first place, this significantly reduces compute costs. + +To be able to find modified nodes, dbt needs to have something to compare against. Normally, we use the Production environment as the source of truth, but in this case there will be new code merged into `qa` long before it hits the `main` branch and Production environment. Because of this, we'll want to defer the QA environment to itself. + +### Optional: also add a compile-only job + +dbt Cloud uses the last successful run of any job in that environment as its [comparison state](/reference/node-selection/syntax#about-node-selection). If you have a lot of PRs in flight, the comparison state could switch around regularly. + +Adding a regularly-scheduled job inside of the QA environment whose only command is `dbt compile` can regenerate a more stable manifest for comparison purposes. + +## Step 5: Test your process + +When the Release Manager is ready to cut a new release, they will manually open a PR from `qa` into `main` from their git provider (e.g. GitHub, GitLab, Azure DevOps). dbt Cloud will detect the new PR, at which point the existing check in the CI environment will trigger and run. When using the [baseline configuration](/guides/orchestration/set-up-ci/in-15-minutes), it's possible to kick off the PR creation from inside of the dbt Cloud IDE.
Under this paradigm, that button will create PRs targeting your QA branch instead. + +To test your new flow, create a new branch in the dbt Cloud IDE then add a new file or modify an existing one. Commit it, then create a new Pull Request (not a draft) against your `qa` branch. You'll see the integration tests begin to run. Once they complete, manually create a PR against `main`, and within a few seconds you’ll see the tests run again but this time incorporating all changes from all code that hasn't been merged to main yet. diff --git a/website/docs/guides/orchestration/webhooks/serverless-datadog.md b/website/docs/guides/orchestration/webhooks/serverless-datadog.md new file mode 100644 index 00000000000..6bd38869259 --- /dev/null +++ b/website/docs/guides/orchestration/webhooks/serverless-datadog.md @@ -0,0 +1,111 @@ +--- +title: "Create Datadog events from dbt Cloud results" +id: webhooks-guide-serverless-datadog +slug: serverless-datadog +description: Configure a serverless app to add Datadog logs +--- + +This guide will teach you how to build and host a basic Python app which will add dbt Cloud job events to Datadog. To do this, when a dbt Cloud job completes it will create a log entry for each node that was run, containing all information about the node provided by the [Discovery API](/docs/dbt-cloud-apis/discovery-schema-job-models). + +In this example, we will use [fly.io](https://fly.io) for hosting/running the service. fly.io is a platform for running full stack apps without provisioning servers etc. This level of usage should comfortably fit inside of the Free tier. You can also use an alternative tool such as [AWS Lambda](https://adem.sh/blog/tutorial-fastapi-aws-lambda-serverless) or [Google Cloud Run](https://github.com/sekR4/FastAPI-on-Google-Cloud-Run). 
+ +## Prerequisites +This guide assumes some familiarity with: +- [dbt Cloud Webhooks](/docs/deploy/webhooks) +- CLI apps +- Deploying code to a serverless code runner like fly.io or AWS Lambda + +## Integration steps + +### 1. Clone the `dbt-cloud-webhooks-datadog` repo + +[This repository](https://github.com/dpguthrie/dbt-cloud-webhooks-datadog) contains the sample code for validating a webhook and creating logs in Datadog. + + +### 2. Install `flyctl` and sign up for fly.io + +Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands: + +Switch to the directory containing the repo you cloned in step 1: +```shell +#example: replace with your actual path +cd ~/Documents/GitHub/dbt-cloud-webhooks-datadog +``` + +Sign up for fly.io: +```shell +flyctl auth signup +``` + +Your console should show `successfully logged in as YOUR_EMAIL` when you're done, but if it doesn't then sign in to fly.io from your command line: +```shell +flyctl auth login +``` + +### 3. Launch your fly.io app +Launching your app publishes it to the web and makes it ready to catch webhook events: +```shell +flyctl launch +``` + +You will see a message saying that an existing `fly.toml` file was found. Type `y` to copy its configuration to your new app. + +Choose an app name of your choosing, such as `YOUR_COMPANY-dbt-cloud-webhook-datadog`, or leave blank and one will be generated for you. Note that your name can only contain numbers, lowercase letters and dashes. + +Choose a deployment region, and take note of the hostname that is generated (normally `APP_NAME.fly.dev`). + +When asked if you would like to set up Postgresql or Redis databases, type `n` for each. + +Type `y` when asked if you would like to deploy now. + +
          +Sample output from the setup wizard: + +joel@Joel-Labes dbt-cloud-webhooks-datadog % flyctl launch
          +An existing fly.toml file was found for app dbt-cloud-webhooks-datadog
          +? Would you like to copy its configuration to the new app? Yes
          +Creating app in /Users/joel/Documents/GitHub/dbt-cloud-webhooks-datadog
          +Scanning source code
          +Detected a Dockerfile app
          +? Choose an app name (leave blank to generate one): demo-dbt-cloud-webhook-datadog
          +automatically selected personal organization: Joel Labes
          +Some regions require a paid plan (fra, maa).
          +See https://fly.io/plans to set up a plan.
          +? Choose a region for deployment: [Use arrows to move, type to filter]
          +? Choose a region for deployment: Sydney, Australia (syd)
          +Created app demo-dbt-cloud-webhook-datadog in organization personal
          +Admin URL: https://fly.io/apps/demo-dbt-cloud-webhook-datadog
          +Hostname: demo-dbt-cloud-webhook-datadog.fly.dev
          +? Would you like to set up a Postgresql database now? No
          +? Would you like to set up an Upstash Redis database now? No
          +Wrote config file fly.toml
          +? Would you like to deploy now? Yes +
          +
          + +### 4. Create a Datadog API Key +[Create an API Key for your Datadog account](https://docs.datadoghq.com/account_management/api-app-keys/) and make note of it and your Datadog site (e.g. `datadoghq.com`) for later. + +### 5. Configure a new webhook in dbt Cloud +See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**. + +Set the webhook URL to the host name you created earlier (`APP_NAME.fly.dev`) + +Make note of the Webhook Secret Key for later. + +*Do not test the endpoint*; it won't work until you have stored the auth keys (next step) + +### 6. Store secrets +The application requires four secrets to be set, using these names: +- `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metadata Only` permission. +- `DBT_CLOUD_AUTH_TOKEN`: the Secret Key for the dbt Cloud webhook you created earlier. +- `DD_API_KEY`: the API key you created earlier. +- `DD_SITE`: The Datadog site for your organisation, e.g. `datadoghq.com`. + +Set these secrets as follows, replacing `abc123` etc with actual values: +```shell +flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 DD_API_KEY=ghi789 DD_SITE=datadoghq.com +``` + +### 7. Deploy your app +After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**.
diff --git a/website/docs/guides/orchestration/webhooks/serverless-pagerduty.md b/website/docs/guides/orchestration/webhooks/serverless-pagerduty.md new file mode 100644 index 00000000000..5455af60110 --- /dev/null +++ b/website/docs/guides/orchestration/webhooks/serverless-pagerduty.md @@ -0,0 +1,116 @@ +--- +title: "Create PagerDuty alarms from failed dbt Cloud tasks" +id: webhooks-guide-serverless-pagerduty +slug: serverless-pagerduty +description: Configure a serverless app to create PagerDuty alarms +--- + +This guide will teach you how to build and host a basic Python app which will monitor dbt Cloud jobs and create PagerDuty alarms based on failure. To do this, when a dbt Cloud job completes it will: + - Check for any failed nodes (e.g. non-passing tests or errored models), and + - create a PagerDuty alarm based on those nodes by calling the PagerDuty Events API. Events are deduplicated per run ID. + +![Screenshot of the PagerDuty UI, showing an alarm created by invalid SQL in a dbt model](/img/guides/orchestration/webhooks/serverless-pagerduty/pagerduty-example-alarm.png) + +In this example, we will use fly.io for hosting/running the service. fly.io is a platform for running full stack apps without provisioning servers etc. This level of usage should comfortably fit inside of the Free tier. You can also use an alternative tool such as [AWS Lambda](https://adem.sh/blog/tutorial-fastapi-aws-lambda-serverless) or [Google Cloud Run](https://github.com/sekR4/FastAPI-on-Google-Cloud-Run). + +## Prerequisites +This guide assumes some familiarity with: +- [dbt Cloud Webhooks](/docs/deploy/webhooks) +- CLI apps +- Deploying code to a serverless code runner like fly.io or AWS Lambda + +## Integration steps + +### 1. Clone the `dbt-cloud-webhooks-pagerduty` repo + +[This repository](https://github.com/dpguthrie/dbt-cloud-webhooks-pagerduty) contains the sample code for validating a webhook and creating events in PagerDuty. + + +### 2. 
Install `flyctl` and sign up for fly.io + +Follow the directions for your OS in the [fly.io docs](https://fly.io/docs/hands-on/install-flyctl/), then from your command line, run the following commands: + +Switch to the directory containing the repo you cloned in step 1: +```shell +#example: replace with your actual path +cd ~/Documents/GitHub/dbt-cloud-webhooks-pagerduty +``` + +Sign up for fly.io: +```shell +flyctl auth signup +``` + +Your console should show `successfully logged in as YOUR_EMAIL` when you're done, but if it doesn't then sign in to fly.io from your command line: +```shell +flyctl auth login +``` + +### 3. Launch your fly.io app +Launching your app publishes it to the web and makes it ready to catch webhook events: +```shell +flyctl launch +``` + +You will see a message saying that an existing `fly.toml` file was found. Type `y` to copy its configuration to your new app. + +Choose an app name of your choosing, such as `YOUR_COMPANY-dbt-cloud-webhook-pagerduty`, or leave blank and one will be generated for you. Note that your name can only contain numbers, lowercase letters and dashes. + +Choose a deployment region, and take note of the hostname that is generated (normally `APP_NAME.fly.dev`). + +When asked if you would like to set up Postgresql or Redis databases, type `n` for each. + +Type `y` when asked if you would like to deploy now. + +
          +Sample output from the setup wizard: + +joel@Joel-Labes dbt-cloud-webhooks-pagerduty % flyctl launch
          +An existing fly.toml file was found for app dbt-cloud-webhooks-pagerduty
          +? Would you like to copy its configuration to the new app? Yes
          +Creating app in /Users/joel/Documents/GitHub/dbt-cloud-webhooks-pagerduty
          +Scanning source code
          +Detected a Dockerfile app
          +? Choose an app name (leave blank to generate one): demo-dbt-cloud-webhook-pagerduty
          +automatically selected personal organization: Joel Labes
          +Some regions require a paid plan (fra, maa).
          +See https://fly.io/plans to set up a plan.
          +? Choose a region for deployment: [Use arrows to move, type to filter]
          +? Choose a region for deployment: Sydney, Australia (syd)
          +Created app demo-dbt-cloud-webhook-pagerduty in organization personal
          +Admin URL: https://fly.io/apps/demo-dbt-cloud-webhook-pagerduty
          +Hostname: demo-dbt-cloud-webhook-pagerduty.fly.dev
          +? Would you like to set up a Postgresql database now? No
          +? Would you like to set up an Upstash Redis database now? No
          +Wrote config file fly.toml
          +? Would you like to deploy now? Yes +
          +
          + +### 4. Create a PagerDuty integration application +See [PagerDuty's guide](https://developer.pagerduty.com/docs/ZG9jOjExMDI5NTgw-events-api-v2-overview#getting-started) for full instructions. + +Make note of the integration key for later. + +### 5. Configure a new webhook in dbt Cloud +See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**. + +Set the webhook URL to the host name you created earlier (`APP_NAME.fly.dev`) + +Make note of the Webhook Secret Key for later. + +*Do not test the endpoint*; it won't work until you have stored the auth keys (next step) + +### 6. Store secrets +The application requires three secrets to be set, using these names: +- `DBT_CLOUD_SERVICE_TOKEN`: a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens) with at least the `Metadata Only` permission. +- `DBT_CLOUD_AUTH_TOKEN`: the Secret Key for the dbt Cloud webhook you created earlier. +- `PD_ROUTING_KEY`: the integration key for the PagerDuty integration you created earlier. + +Set these secrets as follows, replacing `abc123` etc with actual values: +```shell +flyctl secrets set DBT_CLOUD_SERVICE_TOKEN=abc123 DBT_CLOUD_AUTH_TOKEN=def456 PD_ROUTING_KEY=ghi789 +``` + +### 7. Deploy your app +After you set your secrets, fly.io will redeploy your application. When it has completed successfully, go back to the dbt Cloud webhook settings and click **Test Endpoint**.
\ No newline at end of file diff --git a/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md b/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md new file mode 100644 index 00000000000..bb3f03ef0c0 --- /dev/null +++ b/website/docs/guides/orchestration/webhooks/zapier-ms-teams.md @@ -0,0 +1,159 @@ +--- +title: "Post to Microsoft Teams when a job finishes" +id: webhooks-guide-zapier-ms-teams +slug: zapier-ms-teams +description: Use Zapier and the dbt Cloud API to post to Microsoft Teams +--- + +This guide will show you how to set up an integration between dbt Cloud jobs and Microsoft Teams using [dbt Cloud Webhooks](/docs/deploy/webhooks) and Zapier, similar to the [native Slack integration](/faqs/accounts/slack). + +When a dbt Cloud job finishes running, the integration will: + + - Receive a webhook notification in Zapier, + - Extract the results from the dbt Cloud admin API, and + - Post a summary to a Microsoft Teams channel. + +![Screenshot of a message in MS Teams showing a summary of a dbt Cloud run which failed](/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-ui.png) +## Prerequisites + +In order to set up the integration, you should have familiarity with: +- [dbt Cloud Webhooks](/docs/deploy/webhooks) +- Zapier +## Integration steps +### 1. Set up the connection between Zapier and Microsoft Teams + +* Install the [Zapier app in Microsoft Teams](https://appsource.microsoft.com/en-us/product/office/WA200002044) and [grant Zapier access to your account](https://zapier.com/blog/how-to-automate-microsoft-teams/). + +**Note**: To receive the message, add the Zapier app to the team's channel during installation. + +### 2. Create a new Zap in Zapier +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. 
+ +Press **Continue**, then copy the webhook URL. + +![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) + +### 3. Configure a new webhook in dbt Cloud +See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Choose either **Run completed** or **Run errored**, but not both, or you'll get double messages when a run fails. + +Make note of the Webhook Secret Key for later. + +Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test Trigger**, which will create a sample webhook body based on the test event dbt Cloud sent. + +The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. + +### 4. Store secrets +In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). + +Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). + + + +### 5. Add a code action +Select **Code by Zapier** as the App, and **Run Python** as the Event. + +In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above. 
+ +![Screenshot of the Zapier UI, showing the mappings of raw_body and auth_header](/img/guides/orchestration/webhooks/zapier-common/run-python.png) + +In the **Code** field, paste the following code, replacing `YOUR_SECRET_HERE` with the secret you created when setting up the Storage by Zapier integration. Remember that this is not your dbt Cloud secret. + +The code below will validate the authenticity of the request, extract the run logs for the completed job from the Admin API, and then build a summary message that pulls out any error messages from the end-of-invocation logs created by dbt Core. + +```python +import hashlib +import hmac +import json +import re + + +auth_header = input_data['auth_header'] +raw_body = input_data['raw_body'] + +# Access secret credentials +secret_store = StoreClient('YOUR_SECRET_HERE') +hook_secret = secret_store.get('DBT_WEBHOOK_KEY') +api_token = secret_store.get('DBT_CLOUD_SERVICE_TOKEN') + +# Validate the webhook came from dbt Cloud +signature = hmac.new(hook_secret.encode('utf-8'), raw_body.encode('utf-8'), hashlib.sha256).hexdigest() + +if signature != auth_header: + raise Exception("Calculated signature doesn't match contents of the Authorization header. This webhook may not have been sent from dbt Cloud.") + +full_body = json.loads(raw_body) +hook_data = full_body['data'] + +# Steps derived from these commands won't have their error details shown inline, as they're messy +commands_to_skip_logs = ['dbt source', 'dbt docs'] + +# When testing, you will want to hardcode run_id and account_id to IDs that exist; the sample webhook won't work. 
+run_id = hook_data['runId'] +account_id = full_body['accountId'] + +# Fetch run info from the dbt Cloud Admin API +url = f'https://cloud.getdbt.com/api/v2/accounts/{account_id}/runs/{run_id}/?include_related=["run_steps"]' +headers = {'Authorization': f'Token {api_token}'} +run_data_response = requests.get(url, headers=headers) +run_data_response.raise_for_status() +run_data_results = run_data_response.json()['data'] + +# Overall run summary +outcome_message = f""" +**\[{hook_data['runStatus']} for Run #{run_id} on Job \"{hook_data['jobName']}\"]({run_data_results['href']})** + + +**Environment:** {hook_data['environmentName']} | **Trigger:** {hook_data['runReason']} | **Duration:** {run_data_results['duration_humanized']} + +""" + +# Step-specific summaries +for step in run_data_results['run_steps']: + if step['status_humanized'] == 'Success': + outcome_message += f""" +✅ {step['name']} ({step['status_humanized']} in {step['duration_humanized']}) +""" + else: + outcome_message += f""" +❌ {step['name']} ({step['status_humanized']} in {step['duration_humanized']}) +""" + show_logs = not any(cmd in step['name'] for cmd in commands_to_skip_logs) + if show_logs: + full_log = step['logs'] + # Remove timestamp and any colour tags + full_log = re.sub('\x1b?\[[0-9]+m[0-9:]*', '', full_log) + + summary_start = re.search('(?:Completed with \d+ errors? and \d+ warnings?:|Database Error|Compilation Error|Runtime Error)', full_log) + + line_items = re.findall('(^.*(?:Failure|Error) in .*\n.*\n.*)', full_log, re.MULTILINE) + + if len(line_items) == 0: + relevant_log = f'```{full_log[summary_start.start() if summary_start else 0:]}```' + else: + relevant_log = summary_start[0] + for item in line_items: + relevant_log += f'\n```\n{item.strip()}\n```\n' + outcome_message += f""" +{relevant_log} +""" + +# Zapier looks for the `output` dictionary for use in subsequent steps +output = {'outcome_message': outcome_message} +``` + +### 6. 
Add the Microsoft Teams action +Select **Microsoft Teams** as the App, and **Send Channel Message** as the Action. + +In the **Set up action** area, choose the team and channel. Set the **Message Text Format** to **markdown**, then put **2. Outcome Message** from the Run Python in Code by Zapier output into the **Message Text** field. + +![Screenshot of the Zapier UI, showing the mappings of prior steps to an MS Teams message](/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-zap-config.png) + +### 7. Test and deploy +As you have gone through each step, you should have tested the outputs, so you can now try posting a message into your Teams channel. + +When you're happy with it, remember to ensure that your `run_id` and `account_id` are no longer hardcoded, then publish your Zap. + +## Other notes +- If you post to a chat instead of a team channel, you don't need to add the Zapier app to Microsoft Teams. +- If you post to a chat instead of a team channel, note that markdown is not supported and you will need to remove the markdown formatting. +- If you chose the **Catch Hook** trigger instead of **Catch Raw Hook**, you will need to pass each required property from the webhook as an input instead of running `json.loads()` against the raw body. You will also need to remove the validation code. diff --git a/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md b/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md new file mode 100644 index 00000000000..0764c6c7911 --- /dev/null +++ b/website/docs/guides/orchestration/webhooks/zapier-new-cloud-job.md @@ -0,0 +1,91 @@ +--- +title: "Trigger a dbt Cloud job after a run finishes" +id: webhooks-guide-zapier-new-cloud-job +slug: zapier-new-cloud-job +description: Use Zapier to interact with the dbt Cloud API +--- + +This guide will show you how to trigger a dbt Cloud job based on the successful completion of a different job. 
This can be useful when you need to trigger a job in a different project. Remember that dbt works best when it understands the whole context of the DAG it has been asked to run, so use this ability judiciously. + +## Prerequisites + +In order to set up the integration, you should have familiarity with: +- [dbt Cloud Webhooks](/docs/deploy/webhooks) +- Zapier + +## Integration steps + +### 1. Create a new Zap in Zapier +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. + +Press **Continue**, then copy the webhook URL. + +![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) + +### 2. Configure a new webhook in dbt Cloud +See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**, and you need to change the **Jobs** list to only contain the job you want to trigger the next run. + +Make note of the Webhook Secret Key for later. + +Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test Trigger**, which will create a sample webhook body based on the test event dbt Cloud sent. + +The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. + +### 3. Store secrets +In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). 
+ +Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). + + + +### 4. Add a code action +Select **Code by Zapier** as the App, and **Run Python** as the Event. + +In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above. + +![Screenshot of the Zapier UI, showing the mappings of raw_body and auth_header](/img/guides/orchestration/webhooks/zapier-common/run-python.png) + +In the **Code** field, paste the following code, replacing `YOUR_SECRET_HERE` with the secret you created when setting up the Storage by Zapier integration. Remember that this is not your dbt Cloud secret. + +The code below will validate the authenticity of the request, then send a [`trigger run` command to the dbt Cloud API](https://docs.getdbt.com/dbt-cloud/api-v2-legacy#tag/Jobs/operation/triggerRun) for the given job ID. + +```python +import hashlib +import hmac +import json + +#replace with the Job ID you want to run +target_job_id = 12345 + +auth_header = input_data['auth_header'] +raw_body = input_data['raw_body'] + +# Access secret credentials +secret_store = StoreClient('YOUR_SECRET_HERE') +hook_secret = secret_store.get('DBT_WEBHOOK_KEY') +api_token = secret_store.get('DBT_CLOUD_SERVICE_TOKEN') + +# Validate the webhook came from dbt Cloud +signature = hmac.new(hook_secret.encode('utf-8'), raw_body.encode('utf-8'), hashlib.sha256).hexdigest() + +if signature != auth_header: + raise Exception("Calculated signature doesn't match contents of the Authorization header. 
This webhook may not have been sent from dbt Cloud.") + +full_body = json.loads(raw_body) +hook_data = full_body['data'] + +if hook_data['runStatus'] == "Success": + + # Trigger a new run with the dbt Cloud Admin API + url = f'https://cloud.getdbt.com/api/v2/accounts/{full_body['accountId']}/jobs/{target_job_id}/run' + + body = {'cause': f"Triggered by Zapier because {hook_data['jobName']} Run #{hook_data['runId']} completed successfully"} + headers = {'Authorization': f'Token {api_token}'} + response = requests.post(url, json=body, headers=headers) + response.raise_for_status() + +return +``` + +### 5. Test and deploy +When you're happy with it, remember to ensure that your `account_id` is no longer hardcoded, then publish your Zap. diff --git a/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md b/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md new file mode 100644 index 00000000000..f682baae8e2 --- /dev/null +++ b/website/docs/guides/orchestration/webhooks/zapier-refresh-mode-report.md @@ -0,0 +1,128 @@ +--- +title: "Refresh a Mode dashboard when a job completes" +id: webhooks-guide-zapier-refresh-mode-report +slug: zapier-refresh-mode-report +description: Use Zapier to trigger a Mode dashboard refresh +--- + +This guide will teach you how to refresh a Mode dashboard when a dbt Cloud job has completed successfully and there is fresh data available. The integration will: + + - Receive a webhook notification in Zapier + - Trigger a refresh of a Mode report + +Although we are using the Mode API for a concrete example, the principles are readily transferrable to your [tool](https://learn.hex.tech/docs/develop-logic/hex-api/api-reference#operation/RunProject) [of](https://learn.microsoft.com/en-us/rest/api/power-bi/datasets/refresh-dataset) [choice](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_ref.htm#update_workbook_now). 
+ +## Prerequisites + +In order to set up the integration, you should have familiarity with: +- [dbt Cloud Webhooks](/docs/deploy/webhooks) +- Zapier +- The [Mode API](https://mode.com/developer/api-reference/introduction/) + +## Integration steps + +### 1. Create a new Zap in Zapier +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. + +Press **Continue**, then copy the webhook URL. + +![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) + +### 2. Configure a new webhook in dbt Cloud +See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Your event should be **Run completed**, and you need to change the **Jobs** list to only contain any jobs whose completion should trigger a report refresh. + +Make note of the Webhook Secret Key for later. + +Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test Trigger**, which will create a sample webhook body based on the test event dbt Cloud sent. + +The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. + +### 3. Store secrets +In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens), as well as a [Mode API token and secret](https://mode.com/developer/api-reference/authentication/). 
 + +Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). + + +This guide assumes the names for the secret keys are: `DBT_WEBHOOK_KEY`, `MODE_API_TOKEN`, and `MODE_API_SECRET`. If you are using different names, make sure you update all references to them in the sample code. + +This guide uses a short-lived code action to store the secrets, but you can also use a tool like Postman to interact with the [REST API](https://store.zapier.com/) or create a separate Zap and call the [Set Value Action](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps#3-set-a-value-in-your-store-0-3). + +#### a. Create a Storage by Zapier connection +If you haven't already got one, go to https://zapier.com/app/connections/storage and create a new connection. Remember the UUID secret you generate for later. + +#### b. Add a temporary code step +Choose **Run Python** as the Event. Run the following code: +```python +store = StoreClient('abc123') #replace with your UUID secret +store.set('DBT_WEBHOOK_KEY', 'abc123') #replace with your dbt Cloud Webhook key +store.set('MODE_API_TOKEN', 'abc123') #replace with your Mode API Token +store.set('MODE_API_SECRET', 'abc123') #replace with your Mode API Secret +``` +Test the step. You can delete this Action when the test succeeds. The keys will remain stored as long as they are accessed at least once every three months. + +### 4. Add a code action +Select **Code by Zapier** as the App, and **Run Python** as the Event. + +In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above. 
+ +![Screenshot of the Zapier UI, showing the mappings of raw_body and auth_header](/img/guides/orchestration/webhooks/zapier-common/run-python.png) + +In the **Code** field, paste the following code, replacing `YOUR_SECRET_HERE` in the StoreClient constructor with the secret you created when setting up the Storage by Zapier integration (not your dbt Cloud secret), and setting the `account_username` and `report_token` variables to actual values. + +The code below will validate the authenticity of the request, then send a [`run report` command to the Mode API](https://mode.com/developer/api-reference/analytics/report-runs/#runReport) for the given report token. + +```python +import hashlib +import hmac +import json + +#replace with the report token you want to run +account_username = 'YOUR_MODE_ACCOUNT_USERNAME_HERE' +report_token = 'YOUR_REPORT_TOKEN_HERE' + +auth_header = input_data['auth_header'] +raw_body = input_data['raw_body'] + +# Access secret credentials +secret_store = StoreClient('YOUR_SECRET_HERE') +hook_secret = secret_store.get('DBT_WEBHOOK_KEY') +username = secret_store.get('MODE_API_TOKEN') +password = secret_store.get('MODE_API_SECRET') + +# Validate the webhook came from dbt Cloud +signature = hmac.new(hook_secret.encode('utf-8'), raw_body.encode('utf-8'), hashlib.sha256).hexdigest() + +if signature != auth_header: + raise Exception("Calculated signature doesn't match contents of the Authorization header. 
This webhook may not have been sent from dbt Cloud.") + +full_body = json.loads(raw_body) +hook_data = full_body['data'] + +if hook_data['runStatus'] == "Success": + + # Create a report run with the Mode API + url = f'https://app.mode.com/api/{account_username}/reports/{report_token}/run' + + params = { + 'parameters': { + "user_id": 123, + "location": "San Francisco" + } + } + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/hal+json' + } + response = requests.post( + url, + json=params, + headers=headers, + auth=HTTPBasicAuth(username, password) + ) + response.raise_for_status() + +return +``` + +### 5. Test and deploy +You can iterate on the Code step by modifying the code and then running the test again. When you're happy with it, you can publish your Zap. \ No newline at end of file diff --git a/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md b/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md new file mode 100644 index 00000000000..52a9ae63523 --- /dev/null +++ b/website/docs/guides/orchestration/webhooks/zapier-refresh-tableau-workbook.md @@ -0,0 +1,165 @@ +--- +title: "Refresh Tableau workbook with extracts after a job finishes" +id: webhooks-guide-zapier-refresh-tableau-workbook +slug: zapier-refresh-tableau-workbook +description: Use Zapier to trigger a Tableau workbook refresh +--- + +This guide will teach you how to refresh a Tableau workbook that leverages [extracts](https://help.tableau.com/current/pro/desktop/en-us/extracting_data.htm) when a dbt Cloud job has completed successfully and there is fresh data available. 
The integration will: + + - Receive a webhook notification in Zapier + - Trigger a refresh of a Tableau workbook + +## Prerequisites + +To set up the integration, you need to be familiar with: + +- [dbt Cloud Webhooks](/docs/deploy/webhooks) +- Zapier +- The [Tableau API](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api.htm) +- The [version](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_concepts_versions.htm#rest_api_versioning) of Tableau's REST API that is compatible with your server + +## Integration steps + +### 1. Obtain authentication credentials from Tableau +To authenticate with the Tableau API, obtain a [Personal Access Token](https://help.tableau.com/current/server/en-us/security_personal_access_tokens.htm) from your Tableau Server/Cloud instance. In addition, make sure your Tableau workbook uses data sources that allow refresh access, which is usually set when publishing. + +### 2. Create a new Zap in Zapier +To trigger an action with the delivery of a webhook in Zapier, you'll want to create a new Zap with **Webhooks by Zapier** as the Trigger and **Catch Raw Hook** as the Event. However, if you choose not to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook), which isn't recommended, you can choose **Catch Hook** instead. + +Press **Continue**, then copy the webhook URL. + +![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) + +### 3. Configure a new webhook in dbt Cloud +To set up a webhook subscription for dbt Cloud, follow the instructions in [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription). For the event, choose **Run completed** and modify the **Jobs** list to include only the jobs that should trigger a report refresh. + +Remember to save the Webhook Secret Key for later. 
Paste in the webhook URL obtained from Zapier in step 2 into the **Endpoint** field and test the endpoint. + +Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test Trigger**, which will create a sample webhook body based on the test event dbt Cloud sent. + +The sample body's values are hard-coded and not reflective of your project, but they give Zapier a correctly-shaped object during development. + +### 4. Store secrets +In the next step, you will need the Webhook Secret Key from the prior step, and your Tableau authentication credentials and details. Specifically, you'll need your Tableau server/site URL, server/site name, PAT name, and PAT secret. + +Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps), which prevents your keys from being displayed in plaintext in the Zap code. You will be able to access them via the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). + +This guide assumes the names for the secret keys are: `DBT_WEBHOOK_KEY`, `TABLEAU_SITE_URL`, `TABLEAU_SITE_NAME`, `TABLEAU_API_TOKEN_NAME`, and `TABLEAU_API_TOKEN_SECRET`. If you are using different names, make sure you update all references to them in the sample code. + +This guide uses a short-lived code action to store the secrets, but you can also use a tool like Postman to interact with the [REST API](https://store.zapier.com/) or create a separate Zap and call the [Set Value Action](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps#3-set-a-value-in-your-store-0-3). + +#### a. Create a Storage by Zapier connection + +Create a new connection at https://zapier.com/app/connections/storage if you don't already have one and remember the UUID secret you generate for later. + +#### b. 
Add a temporary code step + +Choose **Run Python** as the Event and input the following code: + +```python +store = StoreClient('abc123') #replace with your UUID secret +store.set('DBT_WEBHOOK_KEY', 'abc123') #replace with your dbt Cloud Webhook key +store.set('TABLEAU_SITE_URL', 'abc123') #replace with your Tableau Site URL, inclusive of https:// and .com +store.set('TABLEAU_SITE_NAME', 'abc123') #replace with your Tableau Site/Server Name +store.set('TABLEAU_API_TOKEN_NAME', 'abc123') #replace with your Tableau API Token Name +store.set('TABLEAU_API_TOKEN_SECRET', 'abc123') #replace with your Tableau API Secret +``` + +Test the step to run the code. You can delete this action when the test succeeds. The keys will remain stored as long as they are accessed at least once every three months. + +### 5. Add a code action +Select **Code by Zapier** as the App, and **Run Python** as the Event. + +In the **Set up action** area, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the **Catch Raw Hook** step above. + +![Screenshot of the Zapier UI, showing the mappings of raw_body and auth_header](/img/guides/orchestration/webhooks/zapier-common/run-python.png) + +In the **Code** field, paste the following code, replacing `YOUR_STORAGE_SECRET_HERE` in the StoreClient constructor with the UUID secret you created when setting up the Storage by Zapier integration, and setting the `workbook_name` and `api_version` variables to actual values. + +The following code validates the authenticity of the request and obtains the workbook ID for the specified workbook name. Next, the code will send an [`update workbook` command to the Tableau API](https://help.tableau.com/current/api/rest_api/en-us/REST/rest_api_ref_workbooks_and_views.htm#update_workbook_now) for the given workbook ID. 
+ +```python +import requests +import hashlib +import json +import hmac + +# Access secret credentials +secret_store = StoreClient('YOUR_STORAGE_SECRET_HERE') +hook_secret = secret_store.get('DBT_WEBHOOK_KEY') +server_url = secret_store.get('TABLEAU_SITE_URL') +server_name = secret_store.get('TABLEAU_SITE_NAME') +pat_name = secret_store.get('TABLEAU_API_TOKEN_NAME') +pat_secret = secret_store.get('TABLEAU_API_TOKEN_SECRET') + +#Enter the name of the workbook to refresh +workbook_name = "YOUR_WORKBOOK_NAME" +api_version = "ENTER_COMPATIBLE_VERSION" + +#Validate authenticity of webhook coming from dbt Cloud +auth_header = input_data['auth_header'] +raw_body = input_data['raw_body'] + +signature = hmac.new(hook_secret.encode('utf-8'), raw_body.encode('utf-8'), hashlib.sha256).hexdigest() + +if signature != auth_header: +raise Exception("Calculated signature doesn't match contents of the Authorization header. This webhook may not have been sent from dbt Cloud.") + +full_body = json.loads(raw_body) +hook_data = full_body['data'] + +if hook_data['runStatus'] == "Success": + +#Authenticate with Tableau Server to get an authentication token +auth_url = f"{server_url}/api/{api_version}/auth/signin" +auth_data = { + "credentials": { + "personalAccessTokenName": pat_name, + "personalAccessTokenSecret": pat_secret, + "site": { + "contentUrl": server_name + } + } +} +auth_headers = { + "Accept": "application/json", + "Content-Type": "application/json" +} +auth_response = requests.post(auth_url, data=json.dumps(auth_data), headers=auth_headers) + +#Extract token to use for subsequent calls +auth_token = auth_response.json()["credentials"]["token"] +site_id = auth_response.json()["credentials"]["site"]["id"] + +#Extract the workbook ID +workbooks_url = f"{server_url}/api/{api_version}/sites/{site_id}/workbooks" +workbooks_headers = { + "Accept": "application/json", + "Content-Type": "application/json", + "X-Tableau-Auth": auth_token +} +workbooks_params = { + "filter": 
f"name:eq:{workbook_name}" +} +workbooks_response = requests.get(workbooks_url, headers=workbooks_headers, params=workbooks_params) + +#Assign workbook ID +workbooks_data = workbooks_response.json() +workbook_id = workbooks_data["workbooks"]["workbook"][0]["id"] + +# Refresh the workbook +refresh_url = f"{server_url}/api/{api_version}/sites/{site_id}/workbooks/{workbook_id}/refresh" +refresh_data = {} +refresh_headers = { + "Accept": "application/json", + "Content-Type": "application/json", + "X-Tableau-Auth": auth_token +} + +refresh_trigger = requests.post(refresh_url, data=json.dumps(refresh_data), headers=refresh_headers) +return {"message": "Workbook refresh has been queued"} +``` + +### 6. Test and deploy +To make changes to your code, you can modify it and test it again. When you're happy with it, you can publish your Zap. diff --git a/website/docs/guides/orchestration/webhooks/zapier-slack.md b/website/docs/guides/orchestration/webhooks/zapier-slack.md new file mode 100644 index 00000000000..c9046ee9943 --- /dev/null +++ b/website/docs/guides/orchestration/webhooks/zapier-slack.md @@ -0,0 +1,304 @@ +--- +title: "Post to Slack with error context when a job fails" +id: webhooks-guide-zapier-slack +slug: zapier-slack +description: Use Zapier and the dbt Cloud API to post error context to Slack +--- + +This guide will show you how to set up an integration between dbt Cloud jobs and Slack using [dbt Cloud webhooks](/docs/deploy/webhooks) and Zapier. It builds on the [native Slack integration](/faqs/accounts/slack) by attaching error message details of models and tests in a thread. + +Note: Because there is not a webhook for Run Cancelled, you may want to keep the standard Slack integration installed to receive those notifications. You could also use the [alternative integration](#alternate-approach) that augments the native integration without replacing it. 
+ +When a dbt Cloud job finishes running, the integration will: + + - Receive a webhook notification in Zapier + - Extract the results from the dbt Cloud admin API + - Post a brief summary of the run to a Slack channel + - Create a threaded message attached to that post which contains any reasons that the job failed + +![Screenshot of a message in Slack showing a summary of a dbt Cloud run which failed](/img/guides/orchestration/webhooks/zapier-slack/slack-thread-example.png) +## Prerequisites + +In order to set up the integration, you should have familiarity with: +- [dbt Cloud webhooks](/docs/deploy/webhooks) +- Zapier +## Integration steps + +### 1. Create a new Zap in Zapier +Use **Webhooks by Zapier** as the Trigger, and **Catch Raw Hook** as the Event. If you don't intend to [validate the authenticity of your webhook](/docs/deploy/webhooks#validate-a-webhook) (not recommended!) then you can choose **Catch Hook** instead. + +Click **Continue**, then copy the webhook URL. + +![Screenshot of the Zapier UI, showing the webhook URL ready to be copied](/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png) + +### 2. Configure a new webhook in dbt Cloud +See [Create a webhook subscription](/docs/deploy/webhooks#create-a-webhook-subscription) for full instructions. Choose **Run completed** as the Event. You can alternatively choose **Run errored**, but you will need to account for the fact that the necessary metadata [might not be available immediately](/docs/deploy/webhooks#completed-errored-event-difference). + +Remember the Webhook Secret Key for later. + +Once you've tested the endpoint in dbt Cloud, go back to Zapier and click **Test Trigger**. This creates a sample webhook body based on the test event dbt Cloud sent. + +The sample body's values are hardcoded and not reflective of your project, but they give Zapier a correctly-shaped object during development. + +### 3. 
Store secrets +In the next step, you will need the Webhook Secret Key from the prior step, and a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). + +Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps). This prevents your keys from being displayed as plaintext in the Zap code. You can access them with the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). + + + + +### 4. Add a code action +Select **Code by Zapier** as the App, and **Run Python** as the Event. + +In the **Set up action** section, add two items to **Input Data**: `raw_body` and `auth_header`. Map those to the `1. Raw Body` and `1. Headers Http Authorization` fields from the previous **Catch Raw Hook** step. + +![Screenshot of the Zapier UI, showing the mappings of raw_body and auth_header](/img/guides/orchestration/webhooks/zapier-common/run-python.png) + +In the **Code** field, paste the following code, replacing `YOUR_SECRET_HERE` with the secret you created when setting up the Storage by Zapier integration. Remember that this is not your dbt Cloud secret. + +This example code validates the authenticity of the request, extracts the run logs for the completed job from the Admin API, and then builds two messages: a summary message containing the outcome of each step and its duration, and a message for inclusion in a thread displaying any error messages extracted from the end-of-invocation logs created by dbt Core. 
+ +```python +import hashlib +import hmac +import json +import re + + +auth_header = input_data['auth_header'] +raw_body = input_data['raw_body'] + +# Access secret credentials +secret_store = StoreClient('YOUR_SECRET_HERE') +hook_secret = secret_store.get('DBT_WEBHOOK_KEY') +api_token = secret_store.get('DBT_CLOUD_SERVICE_TOKEN') + +# Validate the webhook came from dbt Cloud +signature = hmac.new(hook_secret.encode('utf-8'), raw_body.encode('utf-8'), hashlib.sha256).hexdigest() + +if signature != auth_header: + raise Exception("Calculated signature doesn't match contents of the Authorization header. This webhook may not have been sent from dbt Cloud.") + +full_body = json.loads(raw_body) +hook_data = full_body['data'] + +# Steps derived from these commands won't have their error details shown inline, as they're messy +commands_to_skip_logs = ['dbt source', 'dbt docs'] + +# When testing, you will want to hardcode run_id and account_id to IDs that exist; the sample webhook won't work. +run_id = hook_data['runId'] +account_id = full_body['accountId'] + +# Fetch run info from the dbt Cloud Admin API +url = f'https://cloud.getdbt.com/api/v2/accounts/{account_id}/runs/{run_id}/?include_related=["run_steps"]' +headers = {'Authorization': f'Token {api_token}'} +run_data_response = requests.get(url, headers=headers) +run_data_response.raise_for_status() +run_data_results = run_data_response.json()['data'] + +# Overall run summary +step_summary_post = f""" +*\[{hook_data['runStatus']} for Run #{run_id} on Job \"{hook_data['jobName']}\"]({run_data_results['href']})* + +*Environment:* {hook_data['environmentName']} | *Trigger:* {hook_data['runReason']} | *Duration:* {run_data_results['duration_humanized']} + +""" + +threaded_errors_post = "" + +# Step-specific summaries +for step in run_data_results['run_steps']: + if step['status_humanized'] == 'Success': + step_summary_post += f""" +✅ {step['name']} ({step['status_humanized']} in {step['duration_humanized']}) +""" + else: + 
step_summary_post += f""" +❌ {step['name']} ({step['status_humanized']} in {step['duration_humanized']}) +""" + + # Don't try to extract info from steps that don't have well-formed logs + show_logs = not any(cmd in step['name'] for cmd in commands_to_skip_logs) + if show_logs: + full_log = step['logs'] + # Remove timestamp and any colour tags + full_log = re.sub('\x1b?\[[0-9]+m[0-9:]*', '', full_log) + + summary_start = re.search('(?:Completed with \d+ errors? and \d+ warnings?:|Database Error|Compilation Error|Runtime Error)', full_log) + + line_items = re.findall('(^.*(?:Failure|Error) in .*\n.*\n.*)', full_log, re.MULTILINE) + + if not summary_start: + continue + + threaded_errors_post += f""" +*{step['name']}* +""" + # If there are no line items, the failure wasn't related to dbt nodes, and we want the whole rest of the message. + # If there are, then we just want the summary line and then to log out each individual node's error. + if len(line_items) == 0: + relevant_log = f'```{full_log[summary_start.start():]}```' + else: + relevant_log = summary_start[0] + for item in line_items: + relevant_log += f'\n```\n{item.strip()}\n```\n' + threaded_errors_post += f""" +{relevant_log} +""" + +send_error_thread = len(threaded_errors_post) > 0 + +# Zapier looks for the `output` dictionary for use in subsequent steps +output = {'step_summary_post': step_summary_post, 'send_error_thread': send_error_thread, 'threaded_errors_post': threaded_errors_post} +``` + +### 5. Add Slack actions in Zapier +Select **Slack** as the App, and **Send Channel Message** as the Action. + +In the **Action** section, choose which **Channel** to post to. Set the **Message Text** field to **2. Step Summary Post** from the Run Python in Code by Zapier output. + +Configure the other options as you prefer (for example, **Bot Name** and **Bot Icon**). 
+ +![Screenshot of the Zapier UI, showing the mappings of prior steps to a Slack message](/img/guides/orchestration/webhooks/zapier-slack/parent-slack-config.png) + +Add another step, **Filter**. In the **Filter setup and testing** section, set the **Field** to **2. Send Error Thread** and the **condition** to **(Boolean) Is true**. This prevents the Zap from failing if the job succeeded and you try to send an empty Slack message in the next step. + +![Screenshot of the Zapier UI, showing the correctly configured Filter step](/img/guides/orchestration/webhooks/zapier-slack/filter-config.png) + +Add another **Send Channel Message in Slack** action. In the **Action** section, choose the same channel as last time, but set the **Message Text** to **2. Threaded Errors Post** from the same Run Python step. Set the **Thread** value to **3. Message Ts**, which is the timestamp of the post created by the first Slack action. This tells Zapier to add this post as a threaded reply to the main message, which prevents the full (potentially long) output from cluttering your channel. + +![Screenshot of the Zapier UI, showing the mappings of prior steps to a Slack message](/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config.png) + +### 6. Test and deploy + +When you're done testing your Zap, make sure that your `run_id` and `account_id` are no longer hardcoded in the Code step, then publish your Zap. + +## Alternate approach + +Instead of using a webhook as your trigger, you can keep the existing dbt Cloud app installed in your Slack workspace and use its messages being posted to your channel as the trigger. In this case, you can skip validating the webhook and only need to load the context from the thread. + +### 1. Create a new Zap in Zapier +Use **Slack** as the initiating app, and **New Message Posted to Channel** as the Trigger. 
In the **Trigger** section, select the channel where your Slack alerts are being posted, and set **Trigger for Bot Messages?** to **Yes**. + +![Screenshot of the Zapier UI, showing the correctly configured Message trigger step](/img/guides/orchestration/webhooks/zapier-slack/message-trigger-config.png) + +Test your Zap to find an example record. You might need to load additional samples until you get one that relates to a failed job, depending on whether you post all job events to Slack or not. + +### 2. Add a Filter step +Add a **Filter** step with the following conditions: +- **1. Text contains failed on Job** +- **1. User Is Bot Is true** +- **1. User Name Exactly matches dbt Cloud** + +![Screenshot of the Zapier UI, showing the correctly configured Filter step](/img/guides/orchestration/webhooks/zapier-slack/message-trigger-filter.png) + +### 3. Extract the run ID +Add a **Format** step with the **Event** of **Text**, and the Action **Extract Number**. For the **Input**, select **1. Text**. + +![Screenshot of the Zapier UI, showing the Transform step configured to extract a number from the Slack message's Text property](/img/guides/orchestration/webhooks/zapier-slack/extract-number.png) + +Test your step and validate that the run ID has been correctly extracted. + +### 4. Add a Delay +Sometimes dbt Cloud posts the message about the run failing before the run's artifacts are available through the API. For this reason, it's recommended to add a brief delay to increase the likelihood that the data is available. On certain plans, Zapier will automatically retry a job that fails due to a 404 error, but its standdown period is longer than is normally necessary so the context will be missing from your thread for longer. + +A one-minute delay is generally sufficient. + +### 5. 
Store secrets +In the next step, you will need either a dbt Cloud [user token](https://docs.getdbt.com/docs/dbt-cloud-apis/user-tokens) or [service account token](https://docs.getdbt.com/docs/dbt-cloud-apis/service-tokens). + +Zapier allows you to [store secrets](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps). This prevents your keys from being displayed as plaintext in the Zap code. You can access them with the [StoreClient utility](https://help.zapier.com/hc/en-us/articles/8496293969549-Store-data-from-code-steps-with-StoreClient). + +This guide assumes the name for the secret key is `DBT_CLOUD_SERVICE_TOKEN`. If you're using a different name, make sure you update all references to it in the sample code. + +This guide uses a short-lived code action to store the secrets, but you can also use a tool like Postman to interact with the [REST API](https://store.zapier.com/) or create a separate Zap and call the [Set Value Action](https://help.zapier.com/hc/en-us/articles/8496293271053-Save-and-retrieve-data-from-Zaps#3-set-a-value-in-your-store-0-3). + +#### a. Create a Storage by Zapier connection +If you haven't already got one, go to [https://zapier.com/app/connections/storage](https://zapier.com/app/connections/storage) and create a new connection. Remember the UUID secret you generate for later. + +#### b. Add a temporary code step +Choose **Run Python** as the Event. Run the following code: +```python +store = StoreClient('abc123') #replace with your UUID secret +store.set('DBT_CLOUD_SERVICE_TOKEN', 'abc123') #replace with your dbt Cloud API token +``` +Test the step. You can delete this Action when the test succeeds. The key will remain stored as long as it is accessed at least once every three months. + +### 6. Add a Code action + +Select **Code by Zapier** as the App, and **Run Python** as the Event. + +This step is very similar to the one described in the main example, but you can skip a lot of the initial validation work. 
+ +In the **Action** section, add two items to **Input Data**: `run_id` and `account_id`. Map those to the `3. Output` property and your hardcoded dbt Cloud Account ID, respectively. + +![Screenshot of the Zapier UI, showing the mappings of run_id and account_id](/img/guides/orchestration/webhooks/zapier-slack/code-example-alternate.png) + + +In the **Code** field, paste the following code, replacing `YOUR_SECRET_HERE` with the secret you created when setting up the Storage by Zapier integration. Remember that this is not your dbt Cloud secret. + +This example code extracts the run logs for the completed job from the Admin API, and then builds a message displaying any error messages extracted from the end-of-invocation logs created by dbt Core (which will be posted in a thread). + +```python +import re + +# Access secret credentials +secret_store = StoreClient('YOUR_SECRET_HERE') +api_token = secret_store.get('DBT_CLOUD_SERVICE_TOKEN') + +# Steps derived from these commands won't have their error details shown inline, as they're messy +commands_to_skip_logs = ['dbt source', 'dbt docs'] +run_id = input_data['run_id'] +account_id = input_data['account_id'] +url = f'https://cloud.getdbt.com/api/v2/accounts/{account_id}/runs/{run_id}/?include_related=["run_steps"]' +headers = {'Authorization': f'Token {api_token}'} + +response = requests.get(url, headers=headers) +response.raise_for_status() +results = response.json()['data'] + +threaded_errors_post = "" +for step in results['run_steps']: + show_logs = not any(cmd in step['name'] for cmd in commands_to_skip_logs) + if not show_logs: + continue + if step['status_humanized'] != 'Success': + full_log = step['logs'] + # Remove timestamp and any colour tags + full_log = re.sub('\x1b?\[[0-9]+m[0-9:]*', '', full_log) + + summary_start = re.search('(?:Completed with \d+ errors? 
and \d+ warnings?:|Database Error|Compilation Error|Runtime Error)', full_log) + + line_items = re.findall('(^.*(?:Failure|Error) in .*\n.*\n.*)', full_log, re.MULTILINE) + if not summary_start: + continue + + threaded_errors_post += f""" +*{step['name']}* +""" + # If there are no line items, the failure wasn't related to dbt nodes, and we want the whole rest of the message. + # If there are, then we just want the summary line and then to log out each individual node's error. + if len(line_items) == 0: + relevant_log = f'```{full_log[summary_start.start():]}```' + else: + relevant_log = summary_start[0] + for item in line_items: + relevant_log += f'\n```\n{item.strip()}\n```\n' + threaded_errors_post += f""" +{relevant_log} +""" + +output = {'threaded_errors_post': threaded_errors_post} +``` +### 7. Add Slack action in Zapier + +Add a **Send Channel Message in Slack** action. In the **Action** section, set the channel to **1. Channel Id**, which is the channel that the triggering message was posted in. + +Set the **Message Text** to **5. Threaded Errors Post** from the Run Python step. Set the **Thread** value to **1. Ts**, which is the timestamp of the triggering Slack post. This tells Zapier to add this post as a threaded reply to the main message, which prevents the full (potentially long) output from cluttering your channel. + +![Screenshot of the Zapier UI, showing the mappings of prior steps to a Slack message](/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config-alternate.png) + +### 8. Test and deploy + +When you're done testing your Zap, publish it. 
diff --git a/website/docs/quickstarts/bigquery-qs.md b/website/docs/quickstarts/bigquery-qs.md new file mode 100644 index 00000000000..e50b8aff26a --- /dev/null +++ b/website/docs/quickstarts/bigquery-qs.md @@ -0,0 +1,298 @@ +--- +title: "Quickstart for dbt Cloud and BigQuery" +id: "bigquery" +time_to_complete: '30 minutes' +platform: 'dbt-cloud' +icon: 'bigquery' +hide_table_of_contents: true +--- + +## Introduction + +In this quickstart guide, you'll learn how to use dbt Cloud with BigQuery. It will show you how to: + +- Create a Google Cloud Platform (GCP) project. +- Access sample data in a public dataset. +- Connect dbt Cloud to BigQuery. +- Take a sample query and turn it into a model in your dbt project. A model in dbt is a select statement. +- Add tests to your models. +- Document your models. +- Schedule a job to run. + +:::tip Videos for you +You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) for free if you're interested in course learning with videos. + +::: + +### Prerequisites​ + +- You have a [dbt Cloud account](https://www.getdbt.com/signup/). +- You have a [Google account](https://support.google.com/accounts/answer/27441?hl=en). +- You can use a personal or work account to set up BigQuery through [Google Cloud Platform (GCP)](https://cloud.google.com/free). + +### Related content + +- Learn more with [dbt Courses](https://courses.getdbt.com/collections) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) + +## Create a new GCP project​ + +1. Go to the [BigQuery Console](https://console.cloud.google.com/bigquery) after you log in to your Google account. If you have multiple Google accounts, make sure you’re using the correct one. +2. 
Create a new project from the [Manage resources page](https://console.cloud.google.com/projectcreate?previousPage=%2Fcloud-resource-manager%3Fwalkthrough_id%3Dresource-manager--create-project%26project%3D%26folder%3D%26organizationId%3D%23step_index%3D1&walkthrough_id=resource-manager--create-project). For more information, refer to [Creating a project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#creating_a_project) in the Google Cloud docs. GCP automatically populates the Project name field for you. You can change it to be more descriptive for your use. For example, `dbt Learn - BigQuery Setup`. + +## Create BigQuery datasets + +1. From the [BigQuery Console](https://console.cloud.google.com/bigquery), click **Editor**. Make sure to select your newly created project, which is available at the top of the page. +1. Verify that you can run SQL queries. Copy and paste these queries into the Query Editor: + ```sql + select * from `dbt-tutorial.jaffle_shop.customers`; + select * from `dbt-tutorial.jaffle_shop.orders`; + select * from `dbt-tutorial.stripe.payment`; + ``` + + Click **Run**, then check for results from the queries. For example: +
          + +
          +2. Create new datasets from the [BigQuery Console](https://console.cloud.google.com/bigquery). For more information, refer to [Create datasets](https://cloud.google.com/bigquery/docs/datasets#create-dataset) in the Google Cloud docs. Datasets in BigQuery are equivalent to schemas in a traditional database. On the **Create dataset** page: + - **Dataset ID** — Enter a name that fits the purpose. This name is used like schema in fully qualified references to your database objects such as `database.schema.table`. As an example for this guide, create one for `jaffle_shop` and another one for `stripe` afterward. + - **Data location** — Leave it blank (the default). It determines the GCP location of where your data is stored. The current default location is the US multi-region. All tables within this dataset will share this location. + - **Enable table expiration** — Leave it unselected (the default). The default for the billing table expiration is 60 days. Because billing isn’t enabled for this project, GCP defaults to deprecating tables. + - **Google-managed encryption key** — This option is available under **Advanced options**. Allow Google to manage encryption (the default). +
          + +
          +3. After you create the `jaffle_shop` dataset, create one for `stripe` with all the same values except for **Dataset ID**. + +## Generate BigQuery credentials {#generate-bigquery-credentials} +In order to let dbt connect to your warehouse, you'll need to generate a keyfile. This is analogous to using a database username and password with most other data warehouses. + +1. Start the [GCP credentials wizard](https://console.cloud.google.com/apis/credentials/wizard). Make sure your new project is selected in the header. If you do not see your account or project, click your profile picture to the right and verify you are using the correct email account. For **Credential Type**: + - From the **Select an API** dropdown, choose **BigQuery API** + - Select **Application data** for the type of data you will be accessing + - Click **Next** to create a new service account. +2. Create a service account for your new project from the [Service accounts page](https://console.cloud.google.com/projectselector2/iam-admin/serviceaccounts?supportedpurview=project). For more information, refer to [Create a service account](https://developers.google.com/workspace/guides/create-credentials#create_a_service_account) in the Google Cloud docs. As an example for this guide, you can: + - Type `dbt-user` as the **Service account name** + - From the **Select a role** dropdown, choose **BigQuery Admin** and click **Continue** + - Leave the **Grant users access to this service account** fields blank + - Click **Done** +3. Create a service account key for your new project from the [Service accounts page](https://console.cloud.google.com/iam-admin/serviceaccounts?walkthrough_id=iam--create-service-account-keys&start_index=1#step_index=1). For more information, refer to [Create a service account key](https://cloud.google.com/iam/docs/creating-managing-service-account-keys#creating) in the Google Cloud docs. When downloading the JSON file, make sure to use a filename you can easily remember. 
For example, `dbt-user-creds.json`. For security reasons, dbt Labs recommends that you protect this JSON file like you would your identity credentials; for example, don't check the JSON file into your version control software. + +## Connect dbt Cloud to BigQuery​ +1. Create a new project in [dbt Cloud](https://cloud.getdbt.com/). From **Account settings** (using the gear menu in the top right corner), click **+ New Project**. +2. Enter a project name and click **Continue**. +3. For the warehouse, click **BigQuery** then **Next** to set up your connection. +4. Click **Upload a Service Account JSON File** in settings. +5. Select the JSON file you downloaded in [Generate BigQuery credentials](#generate-bigquery-credentials) and dbt Cloud will fill in all the necessary fields. +6. Click **Test Connection**. This verifies that dbt Cloud can access your BigQuery account. +7. Click **Next** if the test succeeded. If it failed, you might need to go back and regenerate your BigQuery credentials. + + +## Set up a dbt Cloud managed repository + + + +## Initialize your dbt project​ and start developing +Now that you have a repository configured, you can initialize your project and start development in dbt Cloud: + +1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse. +2. Above the file tree to the left, click **Initialize dbt project**. This builds out your folder structure with example models. +3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit` and click **Commit**. This creates the first commit to your managed repo and allows you to open a branch where you can add new dbt code. +4. You can now directly query data from your warehouse and execute `dbt run`. 
You can try this out now: + - Click **+ Create new file**, add this query to the new file, and click **Save as** to save the new file: + ```sql + select * from `dbt-tutorial.jaffle_shop.customers` + ``` + - In the command line bar at the bottom, enter `dbt run` and press **Enter**. You should see a `dbt run succeeded` message. + +## Build your first model +1. Under **Version Control** on the left, click **Create branch**. You can name it `add-customers-model`. You need to create a new branch since the main branch is set to read-only mode. +2. Click the **...** next to the `models` directory, then select **Create file**. +3. Name the file `customers.sql`, then click **Create**. +4. Copy the following query into the file and click **Save**. + +```sql +with customers as ( + + select + id as customer_id, + first_name, + last_name + + from `dbt-tutorial`.jaffle_shop.customers + +), + +orders as ( + + select + id as order_id, + user_id as customer_id, + order_date, + status + + from `dbt-tutorial`.jaffle_shop.orders + +), + +customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + +) + +select * from final +``` + +5. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models. + +Later, you can connect your business intelligence (BI) tools to these views and tables so they only read cleaned up data rather than raw data in your BI tool. 
+ +#### FAQs + + + + + + + +## Change the way your model is materialized + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. +2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. + + + + ```sql + select + id as customer_id, + first_name, + last_name + + from `dbt-tutorial`.jaffle_shop.customers + ``` + + + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from `dbt-tutorial`.jaffle_shop.orders + ``` + + + +3. Edit the SQL in your `models/customers.sql` file as follows: + + + + ```sql + with customers as ( + + select * from {{ ref('stg_customers') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders') }} + + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + + ) + + select * from final + + ``` + + + +4. Execute `dbt run`. + + This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. 
+ +#### FAQs {#faq-2} + + + + + + + + + + + diff --git a/website/docs/quickstarts/codespace-qs.md b/website/docs/quickstarts/codespace-qs.md new file mode 100644 index 00000000000..3cd048c97a4 --- /dev/null +++ b/website/docs/quickstarts/codespace-qs.md @@ -0,0 +1,72 @@ +--- +title: "Quickstart for dbt Core using GitHub Codespaces" +id: codespace +platform: 'dbt-core' +icon: 'fa-github' +hide_table_of_contents: true +--- + +## Introduction + +In this quickstart guide, you’ll learn how to create a codespace and be able to execute the `dbt build` command from it in _less than 5 minutes_. + +dbt Labs provides a [GitHub Codespace](https://docs.github.com/en/codespaces/overview) template that you (and anyone else) can reuse to create a complete dbt environment with a working and runnable project. When you create the codespace, the [dev container](https://docs.github.com/en/codespaces/setting-up-your-project-for-codespaces/adding-a-dev-container-configuration/introduction-to-dev-containers) creates a fully functioning dbt environment, connects to a DuckDB database, and loads a year of data from our fictional Jaffle Shop café, which sells food and beverages in several US cities. The [README](https://github.com/dbt-labs/jaffle-shop-template#readme) for the Jaffle Shop template also provides instructions on how to do this, along with animated GIFs. + +### Prerequisites + +- To use the dbt command-line interface (CLI), it's important that you know some basics of the terminal. In particular, you should understand `cd`, `ls` , and `pwd` to navigate through the directory structure of your computer easily. +- You have a [GitHub account](https://github.com/join). 
+ +## Related content + +- [Create a GitHub repository](/quickstarts/manual-install?step=2) +- [Build your first models](/quickstarts/manual-install?step=3) +- [Test and document your project](/quickstarts/manual-install?step=4) +- [Schedule a job](/quickstarts/manual-install?step=5) +- Learn more with [dbt Courses](https://courses.getdbt.com/collections) + +## Create a codespace + +1. Go to the `jaffle-shop-template` [repository](https://github.com/dbt-labs/jaffle-shop-template) after you log in to your GitHub account. +1. Click **Use this template** at the top of the page and choose **Create new repository**. +1. Click **Create repository from template** when you’re done setting the options for your new repository. +1. Click **Code** (at the top of the new repository’s page). Under the **Codespaces** tab, choose **Create codespace on main**. Depending on how you've configured your computer's settings, this either opens a new browser tab with the Codespace development environment with VSCode running in it or opens a new VSCode window with the codespace in it. +1. Wait for the codespace to finish building by waiting for the `postCreateCommand` command to complete; this can take several minutes: + + + + When this command completes, you can start using the codespace development environment. The terminal the command ran in will close and you will get a prompt in a brand new terminal. + +1. At the terminal's prompt, you can execute any dbt command you want. For example: + + ```shell + /workspaces/test (main) $ dbt build + ``` + + You can also use the [duckcli](https://github.com/dbcli/duckcli) to write SQL against the warehouse from the command line or build reports in the [Evidence](https://evidence.dev/) project provided in the `reports` directory. + + For complete information, refer to the [dbt command reference](https://docs.getdbt.com/reference/dbt-commands). 
Common commands are: + + - [dbt compile](https://docs.getdbt.com/reference/commands/compile) — generates executable SQL from your project source files + - [dbt run](https://docs.getdbt.com/reference/commands/run) — compiles and runs your project + - [dbt test](https://docs.getdbt.com/reference/commands/test) — compiles and tests your project + - [dbt build](https://docs.getdbt.com/reference/commands/build) — compiles, runs, and tests your project + +## Generate a larger data set + +If you'd like to work with a larger selection of Jaffle Shop data, you can generate an arbitrary number of years of fictitious data from within your codespace. + +1. Install the Python package called [jafgen](https://pypi.org/project/jafgen/). At the terminal's prompt, run: + + ```shell + /workspaces/test (main) $ pip install jafgen + ``` + +1. When installation is done, run: + ```shell + /workspaces/test (main) $ jafgen --years NUMBER_OF_YEARS + ``` + Replace `NUMBER_OF_YEARS` with the number of years you want to simulate. This command builds the CSV files and stores them in the `jaffle-data` folder, where they are automatically sourced based on the `sources.yml` file and the [dbt-duckdb](/docs/core/connect-data-platform/duckdb-setup) adapter. + +As you increase the number of years, it takes exponentially more time to generate the data because the Jaffle Shop stores grow in size and number. For a good balance of data size and time to build, dbt Labs suggests a maximum of 6 years. + diff --git a/website/docs/quickstarts/databricks-qs.md b/website/docs/quickstarts/databricks-qs.md new file mode 100644 index 00000000000..08334862517 --- /dev/null +++ b/website/docs/quickstarts/databricks-qs.md @@ -0,0 +1,375 @@ +--- +title: "Quickstart for dbt Cloud and Databricks" +id: "databricks" +platform: 'dbt-cloud' +icon: 'databricks' +hide_table_of_contents: true +--- +## Introduction + +In this quickstart guide, you'll learn how to use dbt Cloud with Databricks. 
It will show you how to: + +- Create a Databricks workspace. +- Load sample data into your Databricks account. +- Connect dbt Cloud to Databricks. +- Take a sample query and turn it into a model in your dbt project. A model in dbt is a select statement. +- Add tests to your models. +- Document your models. +- Schedule a job to run. + +:::tip Videos for you +You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) for free if you're interested in course learning with videos. + +::: + +### Prerequisites​ + +- You have a [dbt Cloud account](https://www.getdbt.com/signup/). +- You have an account with a cloud service provider (such as AWS, GCP, and Azure) and have permissions to create an S3 bucket with this account. For demonstrative purposes, this guide uses AWS as the cloud service provider. + +### Related content + +- Learn more with [dbt Courses](https://courses.getdbt.com/collections) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) + +## Create a Databricks workspace + +1. Use your existing account or sign up for a Databricks account at [Try Databricks](https://databricks.com/). Complete the form with your user information. + +
          + +
          + +2. For the purpose of this tutorial, you will be selecting AWS as our cloud provider but if you use Azure or GCP internally, please choose one of them. The setup process will be similar. +3. Check your email to complete the verification process. +4. After setting up your password, you will be guided to choose a subscription plan. Select the `Premium` or `Enterprise` plan to access the SQL Compute functionality required for using the SQL warehouse for dbt. We have chosen `Premium` for this tutorial. Click **Continue** after selecting your plan. + +
          + +
          + +5. Click **Get Started** when you reach the page shown below, and then click **Confirm** after you validate that you have everything needed. + +
          + +
          +
          + +
          + +6. Now it's time to create your first workspace. A Databricks workspace is an environment for accessing all of your Databricks assets. The workspace organizes objects like notebooks, SQL warehouses, clusters, etc into one place. Provide the name of your workspace and choose the appropriate AWS region and click **Start Quickstart**. You might get the checkbox of **I have data in S3 that I want to query with Databricks**. You do not need to check this off for the purpose of this tutorial. + +
          + +
          + +7. By clicking on `Start Quickstart`, you will be redirected to AWS and asked to log in if you haven’t already. After logging in, you should see a page similar to this. + +
          + +
          + +:::tip +If you get a session error and don’t get redirected to this page, you can go back to the Databricks UI and create a workspace from the interface. All you have to do is click **create workspaces**, choose the quickstart, fill out the form and click **Start Quickstart**. +::: + +8. There is no need to change any of the pre-filled fields in the Parameters. Just add your Databricks password under **Databricks Account Credentials**. Check off the Acknowledgement and click **Create stack**. +
          + +
          + +
          + +
          + +10. Go back to the Databricks tab. You should see that your workspace is ready to use. +
          + +
          +11. Now let’s jump into the workspace. Click **Open** and log into the workspace using the same login as you used to log into the account. + +## Load data + +1. Download these CSV files (the Jaffle Shop sample data) that you will need for this guide: + - [jaffle_shop_customers.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_customers.csv) + - [jaffle_shop_orders.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_orders.csv) + - [stripe_payments.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/stripe_payments.csv) + +2. First we need a SQL warehouse. Find the drop down menu and toggle into the SQL space. +
          + +
          +3. We will be setting up a SQL warehouse now. Select **SQL Warehouses** from the left hand side console. You will see that a default SQL Warehouse exists. + +4. Click **Start** on the Starter Warehouse. This will take a few minutes to get the necessary resources spun up. + +5. Once the SQL Warehouse is up, click **New** and then **File upload** on the dropdown menu. +
          + +
          + +6. Let's load the Jaffle Shop Customers data first. Drop the `jaffle_shop_customers.csv` file into the UI. +
          + +
          + +7. Update the Table Attributes at the top: + + - data_catalog = hive_metastore + - database = default + - table = jaffle_shop_customers + - Make sure that the column data types are correct. The way you can do this is by hovering over the datatype icon next to the column name. + - ID = bigint + - FIRST_NAME = string + - LAST_NAME = string + +
          + +
          + +8. Click **Create** on the bottom once you’re done. + +9. Now let’s do the same for `Jaffle Shop Orders` and `Stripe Payments`. + +
          + +
          + +
          + +
          + +10. Once that's done, make sure you can query the training data. Navigate to the `SQL Editor` through the left hand menu. This will bring you to a query editor. +11. Ensure that you can run a `select *` from each of the tables with the following code snippets. + + ```sql + select * from default.jaffle_shop_customers + select * from default.jaffle_shop_orders + select * from default.stripe_payments + ``` + +
          + +
          + +12. To ensure that any users who might be working on your dbt project have access to your objects, run this command. + + ```sql + grant all privileges on schema default to users; + ``` + +## Connect dbt Cloud to Databricks + +There are two ways to connect dbt Cloud to Databricks. The first option is Partner Connect, which provides a streamlined setup to create your dbt Cloud account from within your new Databricks trial account. The second option is to create your dbt Cloud account separately and build the Databricks connection yourself (connect manually). If you want to get started quickly, dbt Labs recommends using Partner Connect. If you want to customize your setup from the very beginning and gain familiarity with the dbt Cloud setup flow, dbt Labs recommends connecting manually. + +If you want to use Partner Connect, refer to [Connect to dbt Cloud using Partner Connect](https://docs.databricks.com/partners/prep/dbt-cloud.html#connect-to-dbt-cloud-using-partner-connect) in the Databricks docs for instructions. + +If you want to connect manually, refer to [Connect to dbt Cloud manually](https://docs.databricks.com/partners/prep/dbt-cloud.html#connect-to-dbt-cloud-manually) in the Databricks docs for instructions. + +## Set up a dbt Cloud managed repository +If you used Partner Connect, you can skip to [initializing your dbt project](#initialize-your-dbt-project-and-start-developing) because Partner Connect provides you with a managed repository. Otherwise, you will need to create your repository connection. + + + +## Initialize your dbt project​ and start developing +Now that you have a repository configured, you can initialize your project and start development in dbt Cloud: + +1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse. +2. 
Above the file tree to the left, click **Initialize dbt project**. This builds out your folder structure with example models. +3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit` and click **Commit**. This creates the first commit to your managed repo and allows you to open a branch where you can add new dbt code. +4. You can now directly query data from your warehouse and execute `dbt run`. You can try this out now: + - Click **+ Create new file**, add this query to the new file, and click **Save as** to save the new file: + ```sql + select * from default.jaffle_shop_customers + ``` + - In the command line bar at the bottom, enter `dbt run` and click **Enter**. You should see a `dbt run succeeded` message. + +## Build your first model +1. Under **Version Control** on the left, click **Create branch**. You can name it `add-customers-model`. You need to create a new branch since the main branch is set to read-only mode. +3. Click the **...** next to the `models` directory, then select **Create file**. +4. Name the file `customers.sql`, then click **Create**. +5. Copy the following query into the file and click **Save**. 
+ +```sql +with customers as ( + + select + id as customer_id, + first_name, + last_name + + from jaffle_shop_customers + +), + +orders as ( + + select + id as order_id, + user_id as customer_id, + order_date, + status + + from jaffle_shop_orders + +), + +customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + +) + +select * from final +``` + +6. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models. + +Later, you can connect your business intelligence (BI) tools to these views and tables so they only read cleaned up data rather than raw data in your BI tool. + +#### FAQs + + + + + + + +## Change the way your model is materialized + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. +2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. + + + + ```sql + select + id as customer_id, + first_name, + last_name + + from jaffle_shop_customers + ``` + + + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from jaffle_shop_orders + ``` + + + +3. 
Edit the SQL in your `models/customers.sql` file as follows: + + + + ```sql + with customers as ( + + select * from {{ ref('stg_customers') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders') }} + + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + + ) + + select * from final + + ``` + + + +4. Execute `dbt run`. + + This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. + +#### FAQs {#faq-2} + + + + + + + + + diff --git a/website/docs/quickstarts/manual-install-qs.md b/website/docs/quickstarts/manual-install-qs.md new file mode 100644 index 00000000000..2444cf29d7e --- /dev/null +++ b/website/docs/quickstarts/manual-install-qs.md @@ -0,0 +1,467 @@ +--- +title: "Quickstart for dbt Core from a manual install" +id: manual-install +description: "Connecting your warehouse to dbt Core using the CLI." +sidebar_label: "Manual install quickstart" +platform: 'dbt-core' +icon: 'fa-light fa-square-terminal' +hide_table_of_contents: true +--- +## Introduction + +When you use dbt Core to work with dbt, you will be editing files locally using a code editor, and running projects using a command line interface (CLI). 
If you'd rather edit files and run projects using the web-based Integrated Development Environment (IDE), you should refer to the [dbt Cloud quickstarts](/quickstarts). You can also develop and run dbt commands using the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) — a dbt Cloud powered command line. + +### Prerequisites + +* To use dbt Core, it's important that you know some basics of the Terminal. In particular, you should understand `cd`, `ls` and `pwd` to navigate through the directory structure of your computer easily. +* Install dbt Core using the [installation instructions](/docs/core/installation) for your operating system. +* Complete [Setting up (in BigQuery)](/quickstarts/bigquery?step=2) and [Loading data (BigQuery)](/quickstarts/bigquery?step=3). +* [Create a GitHub account](https://github.com/join) if you don't already have one. + +### Create a starter project + +After setting up BigQuery to work with dbt, you are ready to create a starter project with example models, before building your own models. + +## Create a repository + +The following steps use [GitHub](https://github.com/) as the Git provider for this guide, but you can use any Git provider. You should have already [created a GitHub account](https://github.com/join). + +1. [Create a new GitHub repository](https://github.com/new) named `dbt-tutorial`. +2. Select **Public** so the repository can be shared with others. You can always make it private later. +3. Leave the default values for all other settings. +4. Click **Create repository**. +5. Save the commands from "…or create a new repository on the command line" to use later in [Commit your changes](#commit-your-changes). + +## Create a project + +Learn how to use a series of commands using the command line of the Terminal to create your project. dbt Core includes an `init` command that helps scaffold a dbt project. + +To create your dbt project: + +1. 
Make sure you have dbt Core installed and check the version using the `dbt --version` command: + +```shell +dbt --version +``` + +2. Initiate the `jaffle_shop` project using the `init` command: + +```shell +dbt init jaffle_shop +``` + +3. Navigate into your project's directory: + +```shell +cd jaffle_shop +``` + +4. Use `pwd` to confirm that you are in the right spot: + +```shell +$ pwd +> Users/BBaggins/dbt-tutorial/jaffle_shop +``` + +5. Use a code editor like Atom or VSCode to open the project directory you created in the previous steps, which we named jaffle_shop. The content includes folders and `.sql` and `.yml` files generated by the `init` command. + +
          + +
          + +6. Update the following values in the `dbt_project.yml` file: + + + +```yaml +name: jaffle_shop # Change from the default, `my_new_project` + +... + +profile: jaffle_shop # Change from the default profile name, `default` + +... + +models: + jaffle_shop: # Change from `my_new_project` to match the previous value for `name:` + ... +``` + + + +## Connect to BigQuery + +When developing locally, dbt connects to your data warehouse using a [profile](/docs/core/connect-data-platform/connection-profiles), which is a YAML file with all the connection details to your warehouse. + +1. Create a file in the `~/.dbt/` directory named `profiles.yml`. +2. Move your BigQuery keyfile into this directory. +3. Copy the following and paste into the new profiles.yml file. Make sure you update the values where noted. + + + +```yaml +jaffle_shop: # this needs to match the profile in your dbt_project.yml file + target: dev + outputs: + dev: + type: bigquery + method: service-account + keyfile: /Users/BBaggins/.dbt/dbt-tutorial-project-331118.json # replace this with the full path to your keyfile + project: grand-highway-265418 # Replace this with your project id + dataset: dbt_bbagins # Replace this with dbt_your_name, e.g. dbt_bilbo + threads: 1 + timeout_seconds: 300 + location: US + priority: interactive +``` + + + +4. Run the `debug` command from your project to confirm that you can successfully connect: + +```shell +$ dbt debug +> Connection test: OK connection ok +``` + +
          + +
          + +### FAQs + + + + + + + +## Perform your first dbt run + +Our sample project has some example models in it. We're going to check that we can run them to confirm everything is in order. + +1. Enter the `run` command to build example models: + +```shell +dbt run +``` + +You should have an output that looks like this: + +
          + +
          + +## Commit your changes + +Commit your changes so that the repository contains the latest code. + +1. Link the GitHub repository you created to your dbt project by running the following commands in Terminal. Make sure you use the correct git URL for your repository, which you should have saved from step 5 in [Create a repository](#create-a-repository). + +```shell +git init +git branch -M main +git add . +git commit -m "Create a dbt project" +git remote add origin https://github.com/USERNAME/dbt-tutorial.git +git push -u origin main +``` + +2. Return to your GitHub repository to verify your new files have been added. + +### Build your first models + +Now that you set up your sample project, you can get to the fun part — [building models](/docs/build/sql-models)! +In the next steps, you will take a sample query and turn it into a model in your dbt project. + +## Checkout a new git branch + +Check out a new git branch to work on new code: + +1. Create a new branch by using the `checkout` command and passing the `-b` flag: + +```shell +$ git checkout -b add-customers-model +> Switched to a new branch `add-customers-model` +``` + +## Build your first model + + +1. Open your project in your favorite code editor. +2. Create a new SQL file in the `models` directory, named `models/customers.sql`. +3. Paste the following query into the `models/customers.sql` file. + + + +4. From the command line, enter `dbt run`. +
          + +
          + +When you return to the BigQuery console, you can `select` from this model. + +### FAQs + + + + + + + +## Change the way your model is materialized + + + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. +2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. + + + +
          + + + +```sql +select + id as customer_id, + first_name, + last_name + +from `dbt-tutorial`.jaffle_shop.customers +``` + + + + + +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status + +from `dbt-tutorial`.jaffle_shop.orders +``` + + + +
          + +
          + + + +```sql +select + id as customer_id, + first_name, + last_name + +from jaffle_shop_customers +``` + + + + + +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status + +from jaffle_shop_orders +``` + + + +
          + +
          + + + +```sql +select + id as customer_id, + first_name, + last_name + +from jaffle_shop.customers +``` + + + + + +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status + +from jaffle_shop.orders +``` + + + +
          + +
          + + + +```sql +select + id as customer_id, + first_name, + last_name + +from raw.jaffle_shop.customers +``` + + + + + +```sql +select + id as order_id, + user_id as customer_id, + order_date, + status + +from raw.jaffle_shop.orders +``` + + + +
          + +
          + +3. Edit the SQL in your `models/customers.sql` file as follows: + + + +```sql +with customers as ( + + select * from {{ ref('stg_customers') }} + +), + +orders as ( + + select * from {{ ref('stg_orders') }} + +), + +customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + +) + +select * from final + +``` + + + +4. Execute `dbt run`. + +This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. + +### FAQs {#faq-2} + + + + + +### Next steps + + + +You can also explore: + +* The `target` directory to see all of the compiled SQL. The `run` directory shows the create or replace table statements that are running, which are the select statements wrapped in the correct DDL. +* The `logs` file to see how dbt Core logs all of the action happening within your project. It shows the select statements that are running and the python logging happening when dbt runs. + +## Add tests to your models + + + +## Document your models + + + +3. Run `dbt docs serve` command to launch the documentation in a local website. + +#### FAQs + + + + + +#### Next steps + + + +## Commit updated changes + +You need to commit the changes you made to the project so that the repository has your latest code. + +1. Add all your changes to git: `git add -A` +2. 
Commit your changes: `git commit -m "Add customers model, tests, docs"` +3. Push your changes to your repository: `git push` +4. Navigate to your repository, and open a pull request to merge the code into your main branch. + +## Schedule a job + +We recommend using dbt Cloud as the easiest and most reliable way to [deploy jobs](/docs/deploy/deployments) and automate your dbt project in production. + +For more info on how to get started, refer to [create and schedule jobs](/docs/deploy/deploy-jobs#create-and-schedule-jobs). + + + +For more information about using dbt Core to schedule a job, refer to the [dbt airflow](/blog/dbt-airflow-spiritual-alignment) blog post. diff --git a/website/docs/quickstarts/redshift-qs.md b/website/docs/quickstarts/redshift-qs.md new file mode 100644 index 00000000000..67f66d6e275 --- /dev/null +++ b/website/docs/quickstarts/redshift-qs.md @@ -0,0 +1,391 @@ +--- +title: "Quickstart for dbt Cloud and Redshift" +id: "redshift" +platform: 'dbt-cloud' +icon: 'redshift' +hide_table_of_contents: true +--- +## Introduction + +In this quickstart guide, you'll learn how to use dbt Cloud with Redshift. It will show you how to: + +- Set up a Redshift cluster. +- Load sample data into your Redshift account. +- Connect dbt Cloud to Redshift. +- Take a sample query and turn it into a model in your dbt project. A model in dbt is a select statement. +- Add tests to your models +- Document your models +- Schedule a job to run + + +:::tip Videos for you +You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) for free if you're interested in course learning with videos. + +::: + +### Prerequisites + +- You have a [dbt Cloud account](https://www.getdbt.com/signup/). +- You have an AWS account with permissions to execute a CloudFormation template to create appropriate roles and a Redshift cluster. 
+ +### Related content + +- Learn more with [dbt Courses](https://courses.getdbt.com/collections) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) + + +## Create a Redshift cluster +1. Sign in to your [AWS account](https://signin.aws.amazon.com/console) as a root user or an IAM user depending on your level of access. +2. Use a CloudFormation template to quickly set up a Redshift cluster. A CloudFormation template is a configuration file that automatically spins up the necessary resources in AWS. [Start a CloudFormation stack](https://console.aws.amazon.com/cloudformation/home?region=us-east-1#/stacks/new?stackName=dbt-workshop&templateURL=https://tpch-sample-data.s3.amazonaws.com/create-dbtworkshop-infr) and you can refer to the [create-dbtworkshop-infr JSON file](https://github.com/aws-samples/aws-modernization-with-dbtlabs/blob/main/resources/cloudformation/create-dbtworkshop-infr) for more template details. + + +3. Click **Next** for each page until you reach the **Select acknowledgement** checkbox. Select **I acknowledge that AWS CloudFormation might create IAM resources with custom names** and click **Create Stack**. You should land on the stack page with a CREATE_IN_PROGRESS status. + + + +4. When the stack status changes to CREATE_COMPLETE, click the **Outputs** tab on the top to view information that you will use throughout the rest of this guide. Save those credentials for later by keeping this open in a tab. + +5. Type `Redshift` in the search bar at the top and click **Amazon Redshift**. + + + +6. Confirm that your new Redshift cluster is listed in **Cluster overview**. Select your new cluster. The cluster name should begin with `dbtredshiftcluster-`. Then, click **Query Data**. You can choose the classic query editor or v2. We will be using the v2 version for the purpose of this guide. + + + +7. 
You might be asked to Configure account. For the purpose of this sandbox environment, we recommend selecting “Configure account”. + +8. Select your cluster from the list. In the **Connect to** popup, fill out the credentials from the output of the stack: + - **Authentication** — Use the default which is **Database user name and password** (NOTE: IAM authentication is not supported in dbt Cloud). + - **Database** — `dbtworkshop` + - **User name** — `dbtadmin` + - **Password** — Use the autogenerated `RSadminpassword` from the output of the stack and save it for later. + + + + + +9. Click **Create connection**. + +## Load data + +Now we are going to load our sample data into the S3 bucket that our CloudFormation template created. S3 buckets are a simple and inexpensive way to store data outside of Redshift. + +1. The data used in this course is stored as CSVs in a public S3 bucket. You can use the following URLs to download these files. Download these to your computer to use in the following steps. + - [jaffle_shop_customers.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_customers.csv) + - [jaffle_shop_orders.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_orders.csv) + - [stripe_payments.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/stripe_payments.csv) + +2. Now we are going to use the S3 bucket that you created with CloudFormation and upload the files. Go to the search bar at the top and type in `S3` and click on S3. There will be sample data in the bucket already, feel free to ignore it or use it for other modeling exploration. The bucket will be prefixed with `dbt-data-lake`. + + + + +3. Click on the `name of the bucket` S3 bucket. If you have multiple S3 buckets, this will be the bucket that was listed under “Workshopbucket” on the Outputs page. + + + +4. Click **Upload**. Drag the three files into the UI and click the **Upload** button. + + + +5. Remember the name of the S3 bucket for later. 
It should look like this: `s3://dbt-data-lake-xxxx`. You will need it for the next section. +6. Now let’s go back to the Redshift query editor. Search for Redshift in the search bar, choose your cluster, and select Query data. +7. In your query editor, execute this query below to create the schemas that we will be placing your raw data into. You can highlight the statement and then click on Run to run them individually. If you are on the Classic Query Editor, you might need to input them separately into the UI. You should see these schemas listed under `dbtworkshop`. + + ```sql + create schema if not exists jaffle_shop; + create schema if not exists stripe; + ``` + +8. Now create the tables in your schema with these queries using the statements below. These will be populated as tables in the respective schemas. + + ```sql + create table jaffle_shop.customers( + id integer, + first_name varchar(50), + last_name varchar(50) + ); + + create table jaffle_shop.orders( + id integer, + user_id integer, + order_date date, + status varchar(50), + _etl_loaded_at timestamp default current_timestamp + ); + + create table stripe.payment( + id integer, + orderid integer, + paymentmethod varchar(50), + status varchar(50), + amount integer, + created date, + _batched_at timestamp default current_timestamp + ); + ``` + +9. Now we need to copy the data from S3. This enables you to run queries in this guide for demonstrative purposes; it's not an example of how you would do this for a real project. Make sure to update the S3 location, iam role, and region. You can find the S3 and iam role in your outputs from the CloudFormation stack. Find the stack by searching for `CloudFormation` in the search bar, then clicking **Stacks** in the CloudFormation tile. 
+ + ```sql + copy jaffle_shop.customers( id, first_name, last_name) + from 's3://dbt-data-lake-xxxx/jaffle_shop_customers.csv' + iam_role 'arn:aws:iam::XXXXXXXXXX:role/RoleName' + region 'us-east-1' + delimiter ',' + ignoreheader 1 + acceptinvchars; + + copy jaffle_shop.orders(id, user_id, order_date, status) + from 's3://dbt-data-lake-xxxx/jaffle_shop_orders.csv' + iam_role 'arn:aws:iam::XXXXXXXXXX:role/RoleName' + region 'us-east-1' + delimiter ',' + ignoreheader 1 + acceptinvchars; + + copy stripe.payment(id, orderid, paymentmethod, status, amount, created) + from 's3://dbt-data-lake-xxxx/stripe_payments.csv' + iam_role 'arn:aws:iam::XXXXXXXXXX:role/RoleName' + region 'us-east-1' + delimiter ',' + ignoreheader 1 + Acceptinvchars; + ``` + + Ensure that you can run a `select *` from each of the tables with the following code snippets. + + ```sql + select * from jaffle_shop.customers; + select * from jaffle_shop.orders; + select * from stripe.payment; + ``` +## Connect dbt Cloud to Redshift +1. Create a new project in [dbt Cloud](https://cloud.getdbt.com/). From **Account settings** (using the gear menu in the top right corner), click **+ New Project**. +2. Enter a project name and click **Continue**. +3. For the warehouse, click **Redshift** then **Next** to set up your connection. +4. Enter your Redshift settings. Reference your credentials you saved from the CloudFormation template. + - **Hostname** — Your entire hostname. + - **Port** — `5439` + - **Database** — `dbtworkshop`. +
          + +
          + +5. Set your development credentials. These credentials will be used by dbt Cloud to connect to Redshift. Those credentials (as provided in your CloudFormation output) will be: + - **Username** — `dbtadmin` + - **Password** — This is the autogenerated password that you used earlier in the guide + - **Schema** — dbt Cloud automatically generates a schema name for you. By convention, this is `dbt_`. This is the schema connected directly to your development environment, and it's where your models will be built when running dbt within the Cloud IDE. +
          + +
          + +6. Click **Test Connection**. This verifies that dbt Cloud can access your Redshift cluster. +7. Click **Next** if the test succeeded. If it failed, you might need to check your Redshift settings and credentials. + +## Set up a dbt Cloud managed repository + + +## Initialize your dbt project​ and start developing +Now that you have a repository configured, you can initialize your project and start development in dbt Cloud: + +1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse. +2. Above the file tree to the left, click **Initialize dbt project**. This builds out your folder structure with example models. +3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit` and click **Commit**. This creates the first commit to your managed repo and allows you to open a branch where you can add new dbt code. +4. You can now directly query data from your warehouse and execute `dbt run`. You can try this out now: + - Click **+ Create new file**, add this query to the new file, and click **Save as** to save the new file: + ```sql + select * from jaffle_shop.customers + ``` + - In the command line bar at the bottom, enter `dbt run` and click **Enter**. You should see a `dbt run succeeded` message. + +## Build your first model +1. Under **Version Control** on the left, click **Create branch**. You can name it `add-customers-model`. You need to create a new branch since the main branch is set to read-only mode. +3. Click the **...** next to the `models` directory, then select **Create file**. +4. Name the file `customers.sql`, then click **Create**. +5. Copy the following query into the file and click **Save**. 
+ +```sql +with customers as ( + + select + id as customer_id, + first_name, + last_name + + from jaffle_shop.customers + +), + +orders as ( + + select + id as order_id, + user_id as customer_id, + order_date, + status + + from jaffle_shop.orders + +), + +customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + +) + +select * from final +``` + +6. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models. + +Later, you can connect your business intelligence (BI) tools to these views and tables so they only read cleaned up data rather than raw data in your BI tool. + +#### FAQs + + + + + + + +## Change the way your model is materialized + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. +2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. + + + + ```sql + select + id as customer_id, + first_name, + last_name + + from jaffle_shop.customers + ``` + + + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from jaffle_shop.orders + ``` + + + +3. 
Edit the SQL in your `models/customers.sql` file as follows: + + + + ```sql + with customers as ( + + select * from {{ ref('stg_customers') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders') }} + + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + + ) + + select * from final + + ``` + + + +4. Execute `dbt run`. + + This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. + +#### FAQs {#faq-2} + + + + + + + + + diff --git a/website/docs/quickstarts/snowflake-qs.md b/website/docs/quickstarts/snowflake-qs.md new file mode 100644 index 00000000000..33e253e8c15 --- /dev/null +++ b/website/docs/quickstarts/snowflake-qs.md @@ -0,0 +1,471 @@ +--- +title: "Quickstart for dbt Cloud and Snowflake" +id: "snowflake" +platform: 'dbt-cloud' +icon: 'snowflake' +hide_table_of_contents: true +--- +## Introduction + +In this quickstart guide, you'll learn how to use dbt Cloud with Snowflake. It will show you how to: + +- Create a new Snowflake worksheet. +- Load sample data into your Snowflake account. +- Connect dbt Cloud to Snowflake. +- Take a sample query and turn it into a model in your dbt project. A model in dbt is a select statement. +- Add sources to your dbt project. 
Sources allow you to name and describe the raw data already loaded into Snowflake. +- Add tests to your models. +- Document your models. +- Schedule a job to run. + +Snowflake also provides a quickstart for you to learn how to use dbt Cloud. It makes use of a different public dataset (Knoema Economy Data Atlas) than what's shown in this guide. For more information, refer to [Accelerating Data Teams with dbt Cloud & Snowflake](https://quickstarts.snowflake.com/guide/accelerating_data_teams_with_snowflake_and_dbt_cloud_hands_on_lab/) in the Snowflake docs. + +:::tip Videos for you +You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) for free if you're interested in course learning with videos. + +You can also watch the [YouTube video on dbt and Snowflake](https://www.youtube.com/watch?v=kbCkwhySV_I&list=PL0QYlrC86xQm7CoOH6RS7hcgLnd3OQioG). +::: + +### Prerequisites​ + +- You have a [dbt Cloud account](https://www.getdbt.com/signup/). +- You have a [trial Snowflake account](https://signup.snowflake.com/). During trial account creation, make sure to choose the **Enterprise** Snowflake edition so you have `ACCOUNTADMIN` access. For a full implementation, you should consider organizational questions when choosing a cloud provider. For more information, see [Introduction to Cloud Platforms](https://docs.snowflake.com/en/user-guide/intro-cloud-platforms.html) in the Snowflake docs. For the purposes of this setup, all cloud providers and regions will work so choose whichever you’d like. + +### Related content + +- Learn more with [dbt Courses](https://courses.getdbt.com/collections) +- [How we configure Snowflake](https://blog.getdbt.com/how-we-configure-snowflake/) +- [CI jobs](/docs/deploy/continuous-integration) +- [Deploy jobs](/docs/deploy/deploy-jobs) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) + +## Create a new Snowflake worksheet +1. 
Log in to your trial Snowflake account. +2. In the Snowflake UI, click **+ Worksheet** in the upper right corner to create a new worksheet. + +## Load data +The data used here is stored as CSV files in a public S3 bucket and the following steps will guide you through how to prepare your Snowflake account for that data and upload it. + +1. Create a new virtual warehouse, two new databases (one for raw data, the other for future dbt development), and two new schemas (one for `jaffle_shop` data, the other for `stripe` data). + + To do this, run these SQL commands by typing them into the Editor of your new Snowflake worksheet and clicking **Run** in the upper right corner of the UI: + ```sql + create warehouse transforming; + create database raw; + create database analytics; + create schema raw.jaffle_shop; + create schema raw.stripe; + ``` + +2. In the `raw` database and `jaffle_shop` and `stripe` schemas, create three tables and load relevant data into them: + + - First, delete all contents (empty) in the Editor of the Snowflake worksheet. 
Then, run this SQL command to create the `customers` table: + + ```sql + create table raw.jaffle_shop.customers + ( id integer, + first_name varchar, + last_name varchar + ); + ``` + + - Delete all contents in the Editor, then run this command to load data into the `customers` table: + + ```sql + copy into raw.jaffle_shop.customers (id, first_name, last_name) + from 's3://dbt-tutorial-public/jaffle_shop_customers.csv' + file_format = ( + type = 'CSV' + field_delimiter = ',' + skip_header = 1 + ); + ``` + - Delete all contents in the Editor (empty), then run this command to create the `orders` table: + ```sql + create table raw.jaffle_shop.orders + ( id integer, + user_id integer, + order_date date, + status varchar, + _etl_loaded_at timestamp default current_timestamp + ); + ``` + + - Delete all contents in the Editor, then run this command to load data into the `orders` table: + ```sql + copy into raw.jaffle_shop.orders (id, user_id, order_date, status) + from 's3://dbt-tutorial-public/jaffle_shop_orders.csv' + file_format = ( + type = 'CSV' + field_delimiter = ',' + skip_header = 1 + ); + ``` + - Delete all contents in the Editor (empty), then run this command to create the `payment` table: + ```sql + create table raw.stripe.payment + ( id integer, + orderid integer, + paymentmethod varchar, + status varchar, + amount integer, + created date, + _batched_at timestamp default current_timestamp + ); + ``` + - Delete all contents in the Editor, then run this command to load data into the `payment` table: + ```sql + copy into raw.stripe.payment (id, orderid, paymentmethod, status, amount, created) + from 's3://dbt-tutorial-public/stripe_payments.csv' + file_format = ( + type = 'CSV' + field_delimiter = ',' + skip_header = 1 + ); + ``` +3. Verify that the data is loaded by running these SQL queries. Confirm that you can see output for each one.
+ ```sql + select * from raw.jaffle_shop.customers; + select * from raw.jaffle_shop.orders; + select * from raw.stripe.payment; + ``` + +## Connect dbt Cloud to Snowflake + +There are two ways to connect dbt Cloud to Snowflake. The first option is Partner Connect, which provides a streamlined setup to create your dbt Cloud account from within your new Snowflake trial account. The second option is to create your dbt Cloud account separately and build the Snowflake connection yourself (connect manually). If you want to get started quickly, dbt Labs recommends using Partner Connect. If you want to customize your setup from the very beginning and gain familiarity with the dbt Cloud setup flow, dbt Labs recommends connecting manually. + + + + +Using Partner Connect allows you to create a complete dbt account with your [Snowflake connection](/docs/cloud/connect-data-platform/connect-snowflake), [a managed repository](/docs/collaborate/git/managed-repository), [environments](/docs/build/custom-schemas#managing-environments), and credentials. + +1. In the Snowflake UI, click on the home icon in the upper left corner. In the left sidebar, select **Admin**. Then, select **Partner Connect**. Find the dbt tile by scrolling or by searching for dbt in the search bar. Click the tile to connect to dbt. + + + + If you’re using the classic version of the Snowflake UI, you can click the **Partner Connect** button in the top bar of your account. From there, click on the dbt tile to open up the connect box. + + + +2. In the **Connect to dbt** popup, find the **Optional Grant** option and select the **RAW** and **ANALYTICS** databases. This will grant access for your new dbt user role to each database. Then, click **Connect**. + + + + + +3. Click **Activate** when a popup appears: + + + + + +4. After the new tab loads, you will see a form. If you already created a dbt Cloud account, you will be asked to provide an account name. 
If you haven't created an account, you will be asked to provide an account name and password. + + + +5. After you have filled out the form and clicked **Complete Registration**, you will be logged into dbt Cloud automatically. + +6. From your **Account Settings** in dbt Cloud (using the gear menu in the upper right corner), choose the "Partner Connect Trial" project and select **snowflake** in the overview table. Select edit and update the fields **Database** and **Warehouse** to be `analytics` and `transforming`, respectively. + + + + + + + + + +1. Create a new project in dbt Cloud. From **Account settings** (using the gear menu in the top right corner), click **+ New Project**. +2. Enter a project name and click **Continue**. +3. For the warehouse, click **Snowflake** then **Next** to set up your connection. + + + +4. Enter your **Settings** for Snowflake with: + * **Account** — Find your account by using the Snowflake trial account URL and removing `snowflakecomputing.com`. The order of your account information will vary by Snowflake version. For example, Snowflake's Classic console URL might look like: `oq65696.west-us-2.azure.snowflakecomputing.com`. The AppUI or Snowsight URL might look more like: `snowflakecomputing.com/west-us-2.azure/oq65696`. In both examples, your account will be: `oq65696.west-us-2.azure`. For more information, see [Account Identifiers](https://docs.snowflake.com/en/user-guide/admin-account-identifier.html) in the Snowflake docs. + + + + * **Role** — Leave blank for now. You can update this to a default Snowflake role later. + * **Database** — `analytics`. This tells dbt to create new models in the analytics database. + * **Warehouse** — `transforming`. This tells dbt to use the transforming warehouse that was created earlier. + + + +5. Enter your **Development Credentials** for Snowflake with: + * **Username** — The username you created for Snowflake.
The username is not your email address and is usually your first and last name together in one word. + * **Password** — The password you set when creating your Snowflake account. + * **Schema** — You’ll notice that the schema name has been auto created for you. By convention, this is `dbt_`. This is the schema connected directly to your development environment, and it's where your models will be built when running dbt within the Cloud IDE. + * **Target name** — Leave as the default. + * **Threads** — Leave as 4. This is the number of simultaneous connects that dbt Cloud will make to build models concurrently. + + + +6. Click **Test Connection**. This verifies that dbt Cloud can access your Snowflake account. +7. If the connection test succeeds, click **Next**. If it fails, you may need to check your Snowflake settings and credentials. + + + + +## Set up a dbt Cloud managed repository +If you used Partner Connect, you can skip to [initializing your dbt project](#initialize-your-dbt-project-and-start-developing) as the Partner Connect provides you with a managed repository. Otherwise, you will need to create your repository connection. + + + +## Initialize your dbt project​ and start developing +Now that you have a repository configured, you can initialize your project and start development in dbt Cloud: + +1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse. +2. Above the file tree to the left, click **Initialize your project**. This builds out your folder structure with example models. +3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit`. This creates the first commit to your managed repo and allows you to open a branch where you can add new dbt code. +4. You can now directly query data from your warehouse and execute `dbt run`. 
You can try this out now: + - Click **+ Create new file**, add this query to the new file, and click **Save as** to save the new file: + ```sql + select * from raw.jaffle_shop.customers + ``` + - In the command line bar at the bottom, enter `dbt run` and click **Enter**. You should see a `dbt run succeeded` message. + +## Build your first model +1. Under **Version Control** on the left, click **Create branch**. You can name it `add-customers-model`. You need to create a new branch since the main branch is set to read-only mode. +3. Click the **...** next to the `models` directory, then select **Create file**. +4. Name the file `customers.sql`, then click **Create**. +5. Copy the following query into the file and click **Save**. +```sql +with customers as ( + + select + id as customer_id, + first_name, + last_name + + from raw.jaffle_shop.customers + +), + +orders as ( + + select + id as order_id, + user_id as customer_id, + order_date, + status + + from raw.jaffle_shop.orders + +), + +customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + +) + +select * from final +``` + +6. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models. + +Later, you can connect your business intelligence (BI) tools to these views and tables so they only read cleaned up data rather than raw data in your BI tool. + +## Change the way your model is materialized + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. 
Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. +2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. + + + + ```sql + select + id as customer_id, + first_name, + last_name + + from raw.jaffle_shop.customers + ``` + + + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from raw.jaffle_shop.orders + ``` + + + +3. Edit the SQL in your `models/customers.sql` file as follows: + + + + ```sql + with customers as ( + + select * from {{ ref('stg_customers') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders') }} + + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders using (customer_id) + + ) + + select * from final + + ``` + + + +4. Execute `dbt run`. + + This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. + +#### FAQs {#faq-2} + + + + + +## Build models on top of sources + +Sources make it possible to name and describe the data loaded into your warehouse by your extract and load tools. 
By declaring these tables as sources in dbt, you can: +- select from source tables in your models using the `{{ source() }}` function, helping define the lineage of your data +- test your assumptions about your source data +- calculate the freshness of your source data + +1. Create a new YML file `models/sources.yml`. +2. Declare the sources by copying the following into the file and clicking **Save**. + + + + ```yml + version: 2 + + sources: + - name: jaffle_shop + description: This is a replica of the Postgres database used by our app + database: raw + schema: jaffle_shop + tables: + - name: customers + description: One record per customer. + - name: orders + description: One record per order. Includes cancelled and deleted orders. + ``` + + + +3. Edit the `models/stg_customers.sql` file to select from the `customers` table in the `jaffle_shop` source. + + + + ```sql + select + id as customer_id, + first_name, + last_name + + from {{ source('jaffle_shop', 'customers') }} + ``` + + + +4. Edit the `models/stg_orders.sql` file to select from the `orders` table in the `jaffle_shop` source. + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from {{ source('jaffle_shop', 'orders') }} + ``` + + + +5. Execute `dbt run`. + + The results of your `dbt run` will be exactly the same as the previous step. Your `stg_customers` and `stg_orders` + models will still query from the same raw data source in Snowflake. By using `source`, you can + test and document your raw data and also understand the lineage of your sources.
+ + + + + diff --git a/website/docs/quickstarts/starburst-galaxy-qs.md b/website/docs/quickstarts/starburst-galaxy-qs.md new file mode 100644 index 00000000000..33228710509 --- /dev/null +++ b/website/docs/quickstarts/starburst-galaxy-qs.md @@ -0,0 +1,416 @@ +--- +title: "Quickstart for dbt Cloud and Starburst Galaxy" +id: "starburst-galaxy" +platform: 'dbt-cloud' +icon: 'starburst' +hide_table_of_contents: true +--- +## Introduction + +In this quickstart guide, you'll learn how to use dbt Cloud with [Starburst Galaxy](https://www.starburst.io/platform/starburst-galaxy/). It will show you how to: + +- Load data into the Amazon S3 bucket. This guide uses AWS as the cloud service provider for demonstrative purposes. Starburst Galaxy also [supports other data sources](https://docs.starburst.io/starburst-galaxy/catalogs/index.html) such as Google Cloud, Microsoft Azure, and more. +- Connect Starburst Galaxy to the Amazon S3 bucket. +- Create tables with Starburst Galaxy. +- Connect dbt Cloud to Starburst Galaxy. +- Take a sample query and turn it into a model in your dbt project. A model in dbt is a select statement. +- Add tests to your models. +- Document your models. +- Schedule a job to run. +- Connect to multiple data sources in addition to your S3 bucket. + +:::tip Videos for you +You can check out [dbt Fundamentals](https://courses.getdbt.com/courses/fundamentals) for free if you're interested in course learning with videos. + +You can also watch the [Build Better Data Pipelines with dbt and Starburst](https://www.youtube.com/watch?v=tfWm4dWgwRg) YouTube video produced by Starburst Data, Inc. +::: + +### Prerequisites + +- You have a [multi-tenant](/docs/cloud/about-cloud/regions-ip-addresses) deployment in [dbt Cloud](https://www.getdbt.com/signup/). For more information, refer to [Tenancy](/docs/cloud/about-cloud/tenancy). +- You have a [Starburst Galaxy account](https://www.starburst.io/platform/starburst-galaxy/). If you don't, you can start a free trial. 
Refer to the [getting started guide](https://docs.starburst.io/starburst-galaxy/get-started.html) in the Starburst Galaxy docs for further setup details. +- You have an AWS account with permissions to upload data to an S3 bucket. +- For Amazon S3 authentication, you will need either an AWS access key and AWS secret key with access to the bucket, or you will need a cross account IAM role with access to the bucket. For details, refer to these Starburst Galaxy docs: + - [AWS access and secret key instructions](https://docs.starburst.io/starburst-galaxy/security/external-aws.html#aws-access-and-secret-key) + - [Cross account IAM role](https://docs.starburst.io/starburst-galaxy/security/external-aws.html#role) + + +### Related content + +- [dbt Courses](https://courses.getdbt.com/collections) +- [dbt Cloud CI job](/docs/deploy/continuous-integration) +- [Job notifications](/docs/deploy/job-notifications) +- [Source freshness](/docs/deploy/source-freshness) +- [SQL overview for Starburst Galaxy](https://docs.starburst.io/starburst-galaxy/sql/index.html) + +## Load data to an Amazon S3 bucket {#load-data-to-s3} + +Using Starburst Galaxy, you can create tables and also transform them with dbt. Start by loading the Jaffle Shop data (provided by dbt Labs) to your Amazon S3 bucket. Jaffle Shop is a fictional cafe selling food and beverages in several US cities. + +1. Download these CSV files to your local machine: + + - [jaffle_shop_customers.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_customers.csv) + - [jaffle_shop_orders.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/jaffle_shop_orders.csv) + - [stripe_payments.csv](https://dbt-tutorial-public.s3-us-west-2.amazonaws.com/stripe_payments.csv) +2. Upload these files to S3. For details, refer to [Upload objects](https://docs.aws.amazon.com/AmazonS3/latest/userguide/upload-objects.html) in the Amazon S3 docs. 
+ + When uploading these files, you must create the following folder structure and upload the appropriate file to each folder: + + ``` + + dbt-quickstart (folder) + jaffle-shop-customers (folder) + jaffle_shop_customers.csv (file) + jaffle-shop-orders (folder) + jaffle_shop_orders.csv (file) + stripe-payments (folder) + stripe_payments.csv (file) + ``` + +## Connect Starburst Galaxy to the Amazon S3 bucket {#connect-to-s3-bucket} +If your Starburst Galaxy instance is not already connected to your S3 bucket, you need to create a cluster, configure a catalog that allows Starburst Galaxy to connect to the S3 bucket, add the catalog to your new cluster, and configure privilege settings. + +In addition to Amazon S3, Starburst Galaxy supports many other data sources. To learn more about them, you can refer to the [Catalogs overview](https://docs.starburst.io/starburst-galaxy/catalogs/index.html) in the Starburst Galaxy docs. + +1. Create a cluster. Click **Clusters** on the left sidebar of the Starburst Galaxy UI, then click **Create cluster** in the main body of the page. +2. In the **Create a new cluster** modal, you only need to set the following options. You can use the defaults for the other options. + - **Cluster name** — Type a name for your cluster. + - **Cloud provider region** — Select the AWS region. + + When done, click **Create cluster**. + +3. Create a catalog. Click **Catalogs** on the left sidebar of the Starburst Galaxy UI, then click **Create catalog** in the main body of the page. +4. On the **Create a data source** page, select the Amazon S3 tile. +5. In the **Name and description** section of the **Amazon S3** page, fill out the fields. +6. In the **Authentication to S3** section of the **Amazon S3** page, select the [AWS (S3) authentication mechanism](#prerequisites) you chose to connect with. +7. In the **Metastore configuration** section, set these options: + - **Default S3 bucket name** — Enter the name of your S3 bucket you want to access.
+ - **Default directory name** — Enter the folder name of where the Jaffle Shop data lives in the S3 bucket. This is the same folder name you used in [Load data to an Amazon S3 bucket](#load-data-to-s3). + - **Allow creating external tables** — Enable this option. + - **Allow writing to external tables** — Enable this option. + + The **Amazon S3** page should look similar to this, except for the **Authentication to S3** section which is dependent on your setup: + + + +8. Click **Test connection**. This verifies that Starburst Galaxy can access your S3 bucket. +9. Click **Connect catalog** if the connection test passes. + + +10. On the **Set permissions** page, click **Skip**. You can add permissions later if you want. +11. On the **Add to cluster** page, choose the cluster you want to add the catalog to from the dropdown and click **Add to cluster**. +12. Add the location privilege for your S3 bucket to your role in Starburst Galaxy. Click **Access control > Roles and privileges** on the left sidebar of the Starburst Galaxy UI. Then, in the **Roles** table, click the role name **accountadmin**. + + If you're using an existing Starburst Galaxy cluster and don't have access to the accountadmin role, then select a role that you do have access to. + + To learn more about access control, refer to [Access control](https://docs.starburst.io/starburst-galaxy/security/access-control.html) in the Starburst Galaxy docs. +13. On the **Roles** page, click the **Privileges** tab and click **Add privilege**. +14. On the **Add privilege** page, set these options: + - **What would you like to modify privileges for?** — Choose **Location**. + - **Enter a storage location provide** — Enter the storage location of _your S3 bucket_ and the folder of where the Jaffle Shop data lives. Make sure to include the `/*` at the end of the location. + - **Create SQL** — Enable the option. + + When done, click **Add privileges**.
+ + + +## Create tables with Starburst Galaxy +To query the Jaffle Shop data with Starburst Galaxy, you need to create tables using the Jaffle Shop data that you [loaded to your S3 bucket](#load-data-to-s3). You can do this (and run any SQL statement) from the [query editor](https://docs.starburst.io/starburst-galaxy/query/query-editor.html). + +1. Click **Query > Query editor** on the left sidebar of the Starburst Galaxy UI. The main body of the page is now the query editor. +2. Configure the query editor so it queries your S3 bucket. In the upper right corner of the query editor, select your cluster in the first gray box and select your catalog in the second gray box: + + + +3. Copy and paste these queries into the query editor. Then **Run** each query individually. + + Replace `YOUR_S3_BUCKET_NAME` with the name of your S3 bucket. These queries create a schema named `jaffle_shop` and also create the `jaffle_shop_customers`, `jaffle_shop_orders`, and `stripe_payments` tables: + + ```sql + CREATE SCHEMA jaffle_shop WITH (location='s3://YOUR_S3_BUCKET_NAME/dbt-quickstart/'); + + CREATE TABLE jaffle_shop.jaffle_shop_customers ( + id VARCHAR, + first_name VARCHAR, + last_name VARCHAR + ) + + WITH ( + external_location = 's3://YOUR_S3_BUCKET_NAME/dbt-quickstart/jaffle-shop-customers/', + format = 'csv', + type = 'hive', + skip_header_line_count=1 + + ); + + CREATE TABLE jaffle_shop.jaffle_shop_orders ( + + id VARCHAR, + user_id VARCHAR, + order_date VARCHAR, + status VARCHAR + + ) + + WITH ( + external_location = 's3://YOUR_S3_BUCKET_NAME/dbt-quickstart/jaffle-shop-orders/', + format = 'csv', + type = 'hive', + skip_header_line_count=1 + ); + + CREATE TABLE jaffle_shop.stripe_payments ( + + id VARCHAR, + order_id VARCHAR, + paymentmethod VARCHAR, + status VARCHAR, + amount VARCHAR, + created VARCHAR + ) + + WITH ( + + external_location = 's3://YOUR_S3_BUCKET_NAME/dbt-quickstart/stripe-payments/', + format = 'csv', + type = 'hive', + skip_header_line_count=1 + + ); + 
``` +4. When the queries are done, you can see the following hierarchy on the query editor's left sidebar: + + + +5. Verify that the tables were created successfully. In the query editor, run the following queries: + + ```sql + select * from jaffle_shop.jaffle_shop_customers; + select * from jaffle_shop.jaffle_shop_orders; + select * from jaffle_shop.stripe_payments; + ``` + +## Connect dbt Cloud to Starburst Galaxy + +1. Make sure you are still logged in to [Starburst Galaxy](https://galaxy.starburst.io/login). +2. If you haven’t already, set your account’s role to accountadmin. Click your email address in the upper right corner, choose **Switch role** and select **accountadmin**. + + If this role is not listed for you, choose the role you selected in [Connect Starburst Galaxy to the Amazon S3 bucket](#connect-to-s3-bucket) when you added location privilege for your S3 bucket. +3. Click **Clusters** on the left sidebar. +4. Find your cluster in the **View clusters** table and click **Connection info**. Choose **dbt** from the **Select client** dropdown. Keep the **Connection information** modal open. You will use details from that modal in dbt Cloud. +5. In another browser tab, log in to [dbt Cloud](https://cloud.getdbt.com/). +6. Create a new project in dbt Cloud. From Account settings (using the gear menu in the top right corner), click **+ New Project**. +7. Enter a project name and click **Continue**. +8. Choose **Starburst** as your connection and click **Next**. +9. Enter the **Settings** for your new project: + - **Host** – The **Host** value from the **Connection information** modal in your Starburst Galaxy tab. + - **Port** – 443 (which is the default) +10. Enter the **Development Credentials** for your new project: + - **User** – The **User** value from the **Connection information** modal in your Starburst Galaxy tab. Make sure to use the entire string, including the account's role which is the `/` and all the characters that follow. 
If you don’t include it, your default role is used and that might not have the correct permissions for project development. + - **Password** – The password you use to log in to your Starburst Galaxy account. + - **Database** – The Starburst catalog you want to save your data to (for example, when writing new tables). For future reference, database is synonymous to catalog between dbt Cloud and Starburst Galaxy. + - Leave the remaining options as is. You can use their default values. +11. Click **Test Connection**. This verifies that dbt Cloud can access your Starburst Galaxy cluster. +12. Click **Next** if the test succeeded. If it failed, you might need to check your Starburst Galaxy settings and credentials. + +## Set up a dbt Cloud managed repository + + +## Initialize your dbt project​ and start developing +Now that you have a repository configured, you can initialize your project and start development in dbt Cloud: + +1. Click **Start developing in the IDE**. It might take a few minutes for your project to spin up for the first time as it establishes your git connection, clones your repo, and tests the connection to the warehouse. +2. Above the file tree to the left, click **Initialize dbt project**. This builds out your folder structure with example models. +3. Make your initial commit by clicking **Commit and sync**. Use the commit message `initial commit` and click **Commit**. This creates the first commit to your managed repo and allows you to open a branch where you can add new dbt code. +4. You can now directly query data from your warehouse and execute `dbt run`. You can try this out now: + - Click **+ Create new file**, add this query to the new file, and click **Save as** to save the new file: + ```sql + select * from dbt_quickstart.jaffle_shop.jaffle_shop_customers + ``` + - In the command line bar at the bottom, enter `dbt run` and click **Enter**. You should see a `dbt run succeeded` message. + +## Build your first model +1. 
Under **Version Control** on the left, click **Create branch**. You can name it `add-customers-model`. You need to create a new branch since the main branch is set to read-only mode. +3. Click the **...** next to the `models` directory, then select **Create file**. +4. Name the file `customers.sql`, then click **Create**. +5. Copy the following query into the file and click **Save**. + +```sql +with customers as ( + + select + id as customer_id, + first_name, + last_name + + from dbt_quickstart.jaffle_shop.jaffle_shop_customers +), + +orders as ( + + select + id as order_id, + user_id as customer_id, + order_date, + status + + from dbt_quickstart.jaffle_shop.jaffle_shop_orders +), + + +customer_orders as ( + + select + customer_id, + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + group by 1 +), + +final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + left join customer_orders on customers.customer_id = customer_orders.customer_id +) +select * from final + +``` + +6. Enter `dbt run` in the command prompt at the bottom of the screen. You should get a successful run and see the three models. + +Later, you can connect your business intelligence (BI) tools to these views and tables so they only read cleaned up data rather than raw data in your BI tool. + +#### FAQs + + + + + + + +## Change the way your model is materialized + + + +## Delete the example models + + + +## Build models on top of other models + + + +1. Create a new SQL file, `models/stg_customers.sql`, with the SQL from the `customers` CTE in our original query. +2. Create a second new SQL file, `models/stg_orders.sql`, with the SQL from the `orders` CTE in our original query. 
+ + + + ```sql + select + id as customer_id, + first_name, + last_name + + from dbt_quickstart.jaffle_shop.jaffle_shop_customers + ``` + + + + + + ```sql + select + id as order_id, + user_id as customer_id, + order_date, + status + + from dbt_quickstart.jaffle_shop.jaffle_shop_orders + ``` + + + +3. Edit the SQL in your `models/customers.sql` file as follows: + + + + ```sql + with customers as ( + + select * from {{ ref('stg_customers') }} + + ), + + orders as ( + + select * from {{ ref('stg_orders') }} + + ), + + customer_orders as ( + + select + customer_id, + + min(order_date) as first_order_date, + max(order_date) as most_recent_order_date, + count(order_id) as number_of_orders + + from orders + + group by 1 + + ), + + final as ( + + select + customers.customer_id, + customers.first_name, + customers.last_name, + customer_orders.first_order_date, + customer_orders.most_recent_order_date, + coalesce(customer_orders.number_of_orders, 0) as number_of_orders + + from customers + + left join customer_orders on customers.customer_id = customer_orders.customer_id + + ) + + select * from final + + ``` + + + +4. Execute `dbt run`. + + This time, when you performed a `dbt run`, separate views/tables were created for `stg_customers`, `stg_orders` and `customers`. dbt inferred the order to run these models. Because `customers` depends on `stg_customers` and `stg_orders`, dbt builds `customers` last. You do not need to explicitly define these dependencies. + +#### FAQs {#faq-2} + + + + + + + + + + +## Connect to multiple data sources +This quickstart focuses on using dbt Cloud to run models against a data lake (S3) by using Starburst Galaxy as the query engine. In most real world scenarios, the data that is needed for running models is actually spread across multiple data sources and is stored in a variety of formats. With Starburst Galaxy, Starburst Enterprise, and Trino, you can run your models on any of the data you need, no matter where it is stored. 
+ +If you want to try this out, you can refer to the [Starburst Galaxy docs](https://docs.starburst.io/starburst-galaxy/catalogs/) to add more data sources and load the Jaffle Shop data into the source you select. Then, extend your models to query the new data source and the data source you created in this quickstart. diff --git a/website/docs/reference/advanced-config-usage.md b/website/docs/reference/advanced-config-usage.md index 799bc177d16..e2660d066c3 100644 --- a/website/docs/reference/advanced-config-usage.md +++ b/website/docs/reference/advanced-config-usage.md @@ -17,7 +17,7 @@ select ... While dbt provides an alias for any core configurations (e.g. you should use `pre_hook` instead of `pre-hook` in a config block), your dbt project may contain custom configurations without aliases. -If you want to specify these configurations in side of a model, use the altenative config block syntax: +If you want to specify these configurations inside of a model, use the alternative config block syntax: diff --git a/website/docs/reference/analysis-properties.md b/website/docs/reference/analysis-properties.md index e2dce1a7920..fbc7b05538f 100644 --- a/website/docs/reference/analysis-properties.md +++ b/website/docs/reference/analysis-properties.md @@ -2,7 +2,7 @@ title: Analysis properties --- -We recommend you define analysis properties in your `analyses/` directory, which is illustrated in the [`analysis-paths`](analysis-paths) configuration. +We recommend you define analysis properties in your `analyses/` directory, which is illustrated in the [`analysis-paths`](/reference/project-configs/analysis-paths) configuration. You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `analyses/` or `models/` directory. 
@@ -13,14 +13,14 @@ version: 2 analyses: - name: # required - [description](description): + [description](/reference/resource-properties/description): [docs](/reference/resource-configs/docs): show: true | false config: - [tags](resource-configs/tags): | [] + [tags](/reference/resource-configs/tags): | [] columns: - name: - [description](description): + [description](/reference/resource-properties/description): - name: ... # declare properties of additional columns - name: ... # declare properties of additional analyses @@ -28,10 +28,3 @@ analyses: ``` - - - - -* `v0.16.0`: The ability to declare analysis properties was introduced. - - diff --git a/website/docs/reference/artifacts/catalog-json.md b/website/docs/reference/artifacts/catalog-json.md index d5788f6f250..44a3f980c60 100644 --- a/website/docs/reference/artifacts/catalog-json.md +++ b/website/docs/reference/artifacts/catalog-json.md @@ -1,16 +1,17 @@ --- -title: Catalog +title: "Catalog JSON file" +sidebar_label: "Catalog" --- -_Current schema_: [`v1`](https://schemas.getdbt.com/dbt/catalog/v1.json) +**Current schema**: [`v1`](https://schemas.getdbt.com/dbt/catalog/v1.json) -_Produced by:_ `dbt docs generate` +**Produced by:** [`docs generate`](/reference/commands/cmd-docs) -This file contains information from your about the tables and views produced and defined by the resources in your project. Today, dbt uses this file to populate metadata, such as column types and statistics, in the [docs site](documentation). +This file contains information from your data warehouse about the tables and views produced and defined by the resources in your project. Today, dbt uses this file to populate metadata, such as column types and statistics, in the [docs site](/docs/collaborate/documentation). 
### Top-level keys -- [`metadata`](dbt-artifacts#common-metadata) +- [`metadata`](/reference/artifacts/dbt-artifacts#common-metadata) - `nodes`: Dictionary containing information about database objects corresponding to dbt models, seeds, and snapshots. - `sources`: Dictionary containing information about database objects corresponding to dbt sources. - `errors`: Errors received while running metadata queries during `dbt docs generate`. @@ -18,7 +19,7 @@ This file contains information from your about the ### Resource details Within `sources` and `nodes`, each dictionary key is a resource `unique_id`. Each nested resource contains: -- `unique_id`: `..`, same as dictionary key, maps to `nodes` and `sources` in the [manifest](manifest-json) +- `unique_id`: `..`, same as dictionary key, maps to `nodes` and `sources` in the [manifest](/reference/artifacts/manifest-json) - `metadata` - `type`: table, view, etc. - `database` diff --git a/website/docs/reference/artifacts/dbt-artifacts.md b/website/docs/reference/artifacts/dbt-artifacts.md index bc93aa8cf6e..859fde7c908 100644 --- a/website/docs/reference/artifacts/dbt-artifacts.md +++ b/website/docs/reference/artifacts/dbt-artifacts.md @@ -1,13 +1,17 @@ --- -title: Overview +title: "About dbt artifacts" +sidebar_label: "About dbt artifacts" --- -With every invocation, dbt generates and saves one or more *artifacts*. Several of these are files (`manifest.json`, `catalog.json`, `run_results.json`, and `sources.json`) that are used to power: -- [documentation](documentation) -- [state](understanding-state) +With every invocation, dbt generates and saves one or more *artifacts*. 
Several of these are files (`semantic_manifest.json`, `manifest.json`, `catalog.json`, `run_results.json`, and `sources.json`) that are used to power: + +- [documentation](/docs/collaborate/documentation) +- [state](/reference/node-selection/syntax#about-node-selection) - [visualizing source freshness](/docs/build/sources#snapshotting-source-data-freshness) They could also be used to: + +- gain insights into your [dbt Semantic Layer](/docs/use-dbt-semantic-layer/dbt-sl) - calculate project-level test coverage - perform longitudinal analysis of run timing - identify historical changes in structure @@ -18,15 +22,14 @@ dbt has produced artifacts since the release of dbt-docs in v0.11.0. Starting in ## When are artifacts produced? Most dbt commands (and corresponding RPC methods) produce artifacts: -- [manifest](manifest-json): produced by `build`, `compile`, `run`, `test`, `docs generate`, `ls` -- [run results](run-results-json): produced by `build`, `run`, `test`, `seed`, `snapshot`, `docs generate` +- [semantic manifest](/docs/dbt-cloud-apis/sl-manifest): Lives in the `/target` directory of your dbt project and stores various artifacts (such as compiled models and tests) generated during the execution of your project. +- [manifest](/reference/artifacts/manifest-json): produced by commands that read and understand your project +- [run results](/reference/artifacts/run-results-json): produced by commands that run, compile, or catalog nodes in your DAG - [catalog](catalog-json): produced by `docs generate` -- [sources](sources-json): produced by `source freshness` +- [sources](/reference/artifacts/sources-json): produced by `source freshness` ## Common metadata -New in v0.19.0 - All artifacts produced by dbt include a `metadata` dictionary with these properties: - `dbt_version`: Version of dbt that produced this artifact. 
@@ -34,10 +37,11 @@ All artifacts produced by dbt include a `metadata` dictionary with these propert - `generated_at`: Timestamp in UTC when this artifact was produced. - `adapter_type`: The adapter (database), e.g. `postgres`, `spark`, etc. - `env`: Any environment variables prefixed with `DBT_ENV_CUSTOM_ENV_` will be included in a dictionary, with the prefix-stripped variable name as its key. -- [`invocation_id`](invocation_id): Unique identifier for this dbt invocation +- [`invocation_id`](/reference/dbt-jinja-functions/invocation_id): Unique identifier for this dbt invocation In the manifest, the `metadata` may also include: -- `send_anonymous_usage_stats`: Whether this invocation sent [anonymous usage statistics](https://docs.getdbt.com/reference/profiles.yml/#send_anonymous_usage_stats) while executing. +- `send_anonymous_usage_stats`: Whether this invocation sent [anonymous usage statistics](/reference/global-configs/usage-stats) while executing. +- `project_name`: The `name` defined in the root project's `dbt_project.yml`. (Added in manifest v10 / dbt Core v1.6) - `project_id`: Project identifier, hashed from `project_name`, sent with anonymous usage stats if enabled. - `user_id`: User identifier, stored by default in `~/dbt/.user.yml`, sent with anonymous usage stats if enabled. 
diff --git a/website/docs/reference/artifacts/manifest-json.md b/website/docs/reference/artifacts/manifest-json.md index 21faaab44c6..47a9849eda5 100644 --- a/website/docs/reference/artifacts/manifest-json.md +++ b/website/docs/reference/artifacts/manifest-json.md @@ -1,46 +1,50 @@ --- -title: Manifest +title: "Manifest JSON file" +sidebar_label: "Manifest" --- -_Current schema_: [`v7`](https://schemas.getdbt.com/dbt/manifest/v7/index.html) +import ManifestVersions from '/snippets/_manifest-versions.md'; -_Produced by:_ -- `dbt compile` -- `dbt run` -- `dbt test` -- `dbt seed` -- `dbt snapshot` -- `dbt docs generate` -- `dbt source freshness` -- `dbt ls` -- `dbt build` + + +**Produced by:** Any command that parses your project. This includes all commands **except** [`deps`](/reference/commands/deps), [`clean`](/reference/commands/clean), [`debug`](/reference/commands/debug), [`init`](/reference/commands/init) This single file contains a full representation of your dbt project's resources (models, tests, macros, etc), including all node configurations and resource properties. Even if you're only running some models or tests, all resources will appear in the manifest (unless they are disabled) with most of their properties. (A few node properties, such as `compiled_sql`, only appear for executed nodes.) -Today, dbt uses this file to populate the [docs site](documentation), and to perform [state comparison](understanding-state). Members of the community have used this file to run checks on how many models have descriptions and tests. +Today, dbt uses this file to populate the [docs site](/docs/collaborate/documentation), and to perform [state comparison](/reference/node-selection/syntax#about-node-selection). Members of the community have used this file to run checks on how many models have descriptions and tests. 
### Top-level keys -- [`metadata`](dbt-artifacts#common-metadata) +- [`metadata`](/reference/artifacts/dbt-artifacts#common-metadata) - `nodes`: Dictionary of all analyses, models, seeds, snapshots, and tests. - `sources`: Dictionary of sources. +- `metrics`: Dictionary of metrics. - `exposures`: Dictionary of exposures. +- `groups`: Dictionary of groups. (**Note:** Added in v1.5) - `macros`: Dictionary of macros. - `docs`: Dictionary of `docs` blocks. - `parent_map`: Dictionary that contains the first-order parents of each resource. - `child_map`: Dictionary that contains the first-order children of each resource. -- `selectors`: Expanded dictionary representation of [YAML `selectors`](yaml-selectors). +- `group_map`: Dictionary that maps group names to their resource nodes. +- `selectors`: Expanded dictionary representation of [YAML `selectors`](/reference/node-selection/yaml-selectors). - `disabled`: Array of resources with `enabled: false`. ### Resource details -All resources nested within `nodes`, `sources`, `exposures`, `macros`, and `docs` have the following base properties: +All resources nested within `nodes`, `sources`, `metrics`, `exposures`, `macros`, and `docs` have the following base properties: - `name`: Resource name. - `unique_id`: `..`, same as dictionary key - `package_name`: Name of package that defines this resource. -- `root_path`: Absolute file path of this resource's package. +- `root_path`: Absolute file path of this resource's package. (**Note:** This is removed for most node types in dbt Core v1.4 / manifest v8 to reduce duplicative information across nodes, but it is still present for seeds.) - `path`: Relative file path of this resource's definition within its "resource path" (`model-paths`, `seed-paths`, etc.). - `original_file_path`: Relative file path of this resource's definition, including its resource path. Each has several additional properties related to its resource type. 
+ +### dbt JSON Schema +You can refer to [dbt JSON Schema](https://schemas.getdbt.com/) for info on describing and consuming dbt generated artifacts. + +**Note**: The `manifest.json` version number is related to (but not _equal_ to) your dbt version, so you _must_ use the correct `manifest.json` version for your dbt version. To find the correct `manifest.json` version, select the dbt version on the top navigation (such as `v1.5`). + +Refer to the table at the beginning of [this page](/reference/artifacts/manifest-json) to understand how the Manifest version matches the dbt version. diff --git a/website/docs/reference/artifacts/other-artifacts.md b/website/docs/reference/artifacts/other-artifacts.md index dfcdccbecc9..d776bc8a099 100644 --- a/website/docs/reference/artifacts/other-artifacts.md +++ b/website/docs/reference/artifacts/other-artifacts.md @@ -1,23 +1,42 @@ --- -title: Other Artifacts +title: "Other artifact files" +sidebar_label: "Other artifacts" --- ### index.html -_Produced by:_ `dbt docs generate` +**Produced by:** [`docs generate`](/reference/commands/cmd-docs) -This file is the skeleton of the [auto-generated dbt documentation website](documentation). The contents of the site are populated by the [manifest](manifest-json) and [catalog](catalog-json). +This file is the skeleton of the [auto-generated dbt documentation website](/docs/collaborate/documentation). The contents of the site are populated by the [manifest](/reference/artifacts/manifest-json) and [catalog](catalog-json). Note: the source code for `index.json` comes from the [dbt-docs repo](https://github.com/dbt-labs/dbt-docs). Head over there if you want to make a bug report, suggestion, or contribution relating to the documentation site. ### partial_parse.msgpack -_Produced by: all commands_ +**Produced by:** [manifest commands](/reference/artifacts/manifest-json) + [`parse`](/reference/commands/parse) -This file is used to store a compressed representation of files dbt has parsed. 
If you have [partial parsing](parsing#partial-parsing) enabled, dbt will use this file to identify the files that have changed and avoid re-parsing the rest. +This file is used to store a compressed representation of files dbt has parsed. If you have [partial parsing](/reference/parsing#partial-parsing) enabled, dbt will use this file to identify the files that have changed and avoid re-parsing the rest. ### graph.gpickle -_Produced by: all commands_ +**Produced by:** commands supporting [node selection](/reference/node-selection/syntax) Stores the networkx representation of the dbt resource DAG. + +### graph_summary.json + + + +:::info New functionality +This functionality is new in v1.6. +::: + + + +**Produced by:** [manifest commands](/reference/artifacts/manifest-json) + +This file is useful for investigating performance issues in dbt Core's graph algorithms. + +It is more anonymized and compact than [`manifest.json`](/reference/artifacts/manifest-json) and [`graph.gpickle`](#graphgpickle). + +It contains only the `name` and `type` of each node along with IDs of its child nodes (`succ`). It includes that information at two separate points in time: immediately after the graph is linked together (`linked`), and after test edges have been added (`with_test_edges`). 
diff --git a/website/docs/reference/artifacts/run-results-json.md b/website/docs/reference/artifacts/run-results-json.md index f2ded4f23dc..dd92a9c4e53 100644 --- a/website/docs/reference/artifacts/run-results-json.md +++ b/website/docs/reference/artifacts/run-results-json.md @@ -1,37 +1,46 @@ --- -title: Run Results +title: "Run results JSON file" +sidebar_label: "Run results" --- -_Current schema_: [`v4`](https://schemas.getdbt.com/dbt/run-results/v4/index.html) +**Current schema**: [`v4`](https://schemas.getdbt.com/dbt/run-results/v4/index.html) -_Produced by:_ -- `dbt run` -- `dbt test` -- `dbt seed` -- `dbt snapshot` -- `dbt compile` -- `dbt docs generate` -- `dbt build` + **Produced by:** + [`build`](/reference/commands/build) + [`compile`](/reference/commands/compile) + [`docs generate`](/reference/commands/cmd-docs) + [`run`](/reference/commands/run) + [`seed`](/reference/commands/seed) + [`snapshot`](/reference/commands/snapshot) + [`test`](/reference/commands/test) [`run-operation`](/reference/commands/run-operation) + This file contains information about a completed invocation of dbt, including timing and status info for each node (model, test, etc) that was executed. In aggregate, many `run_results.json` can be combined to calculate average model runtime, test failure rates, the number of record changes captured by snapshots, etc. Note that only executed nodes appear in the run results. If you have multiple run or test steps with different critiera, each will produce different run results. -Note: `dbt source freshness` produces a different artifact, [`sources.json`](sources-json), with similar attributes. +Note: `dbt source freshness` produces a different artifact, [`sources.json`](/reference/artifacts/sources-json), with similar attributes. ### Top-level keys -- [`metadata`](dbt-artifacts#common-metadata) -- `args`: Dictionary of arguments passed to the CLI command or RPC method that produced this artifact. 
Most useful is `which` (command) or `rpc_method`. This dict excludes null values, and includes default values if they are not null. Equivalent to [`invocation_args_dict`](flags#invocation_args_dict) in the dbt-Jinja context. +- [`metadata`](/reference/artifacts/dbt-artifacts#common-metadata) +- `args`: Dictionary of arguments passed to the CLI command or RPC method that produced this artifact. Most useful is `which` (command) or `rpc_method`. This dict excludes null values, and includes default values if they are not null. Equivalent to [`invocation_args_dict`](/reference/dbt-jinja-functions/flags#invocation_args_dict) in the dbt-Jinja context. - `elapsed_time`: Total invocation time in seconds. - `results`: Array of node execution details. -Each entry in `results` is a [`Result` object](dbt-classes#result-objects), with one difference: Instead of including the entire `node` object, only the `unique_id` is included. (The full `node` object is recorded in [`manifest.json`](manifest-json).) +Each entry in `results` is a [`Result` object](/reference/dbt-classes#result-objects), with one difference: Instead of including the entire `node` object, only the `unique_id` is included. (The full `node` object is recorded in [`manifest.json`](/reference/artifacts/manifest-json).) -- `unique_id`: Unique node identifier, which map results to `nodes` in the [manifest](manifest-json) +- `unique_id`: Unique node identifier, which map results to `nodes` in the [manifest](/reference/artifacts/manifest-json) - `status`: dbt's interpretation of runtime success, failure, or error - `thread_id`: Which thread executed this node? E.g. `Thread-1` - `execution_time`: Total time spent executing this node - `timing`: Array that breaks down execution time into steps (often `compile` + `execute`) -- `adapter_response`: Dictionary of metadata returned from the database, which varies by adapter. E.g. success `code`, number of `rows_affected`, total `bytes_processed`, etc. 
Not populated by tests, as of v0.19.0; we plan to fix in a future release ([dbt#2580](https://github.com/dbt-labs/dbt-core/issues/2580)). - `message`: How dbt will report this result on the CLI, based on information returned from the database + +import RowsAffected from '/snippets/_run-result.md'; + + + + + + diff --git a/website/docs/reference/artifacts/sources-json.md b/website/docs/reference/artifacts/sources-json.md index 8d36d9d2e1e..3b7f7539896 100644 --- a/website/docs/reference/artifacts/sources-json.md +++ b/website/docs/reference/artifacts/sources-json.md @@ -1,27 +1,32 @@ --- -title: Sources +title: "Sources JSON file" +sidebar_label: "Sources" --- -_Current schema_: [`v3`](https://schemas.getdbt.com/dbt/sources/v3/index.html) +**Current schema:** [`v3`](https://schemas.getdbt.com/dbt/sources/v3/index.html) -_Produced by:_ `dbt source freshness` +**Produced by:** [`source freshness`](/reference/commands/source) This file contains information about [sources with freshness checks](/docs/build/sources#checking-source-freshness). Today, dbt Cloud uses this file to power its [Source Freshness visualization](/docs/build/sources#snapshotting-source-data-freshness). ### Top-level keys -- [`metadata`](dbt-artifacts#common-metadata) +- [`metadata`](/reference/artifacts/dbt-artifacts#common-metadata) - `elapsed_time`: Total invocation time in seconds. - `results`: Array of freshness-check execution details. Each entry in `results` is a dictionary with the following keys: -- `unique_id`: Unique source node identifier, which map results to `sources` in the [manifest](manifest-json) +- `unique_id`: Unique source node identifier, which map results to `sources` in the [manifest](/reference/artifacts/manifest-json) - `max_loaded_at`: Max value of `loaded_at_field` timestamp in the source when queried. - `snapshotted_at`: Current timestamp when querying. 
- `max_loaded_at_time_ago_in_s`: Interval between `max_loaded_at` and `snapshotted_at`, calculated in python to handle timezone complexity. - `criteria`: The freshness threshold(s) for this source, defined in the project. - `status`: The freshness status of this source, based on `max_loaded_at_time_ago_in_s` + `criteria`, reported on the CLI. One of `pass`, `warn`, or `error` if the query succeeds, `runtime error` if the query fails. -- `adapter_response`: Dictionary of information returned from the database, which varies by adapter. Not populated by source freshness checks, as of v0.19.0; we plan to fix in a future release ([dbt#2580](https://github.com/dbt-labs/dbt-core/issues/2580)). - `execution_time`: Total time spent checking freshness for this source - `timing`: Array that breaks down execution time into steps (`compile` + `execute`) + +import RowsAffected from '/snippets/_run-result.md'; + + + diff --git a/website/docs/reference/commands/build.md b/website/docs/reference/commands/build.md index 50b443246c3..846810c1652 100644 --- a/website/docs/reference/commands/build.md +++ b/website/docs/reference/commands/build.md @@ -1,14 +1,9 @@ --- -title: "build" +title: "About dbt build command" +sidebar_label: "build" id: "build" --- - - -- Introduced in **v0.21.0** - - - The `dbt build` command will: - run models - test tests @@ -19,14 +14,14 @@ In DAG order, for selected resources or an entire project. ### Details -**Artifacts:** The `build` task will write a single [manifest](artifacts/manifest-json) and a single [run results artifact](artifacts/run-results-json). The run results will include information about all models, tests, seeds, and snapshots that were selected to build, combined into one file. +**Artifacts:** The `build` task will write a single [manifest](/reference/artifacts/manifest-json) and a single [run results artifact](/reference/artifacts/run-results-json). 
The run results will include information about all models, tests, seeds, and snapshots that were selected to build, combined into one file. **Skipping on failures:** Tests on upstream resources will block downstream resources from running, and a test failure will cause those downstream resources to skip entirely. E.g. If `model_b` depends on `model_a`, and a `unique` test on `model_a` fails, then `model_b` will `SKIP`. -- Don't want a test to cause skipping? Adjust its [severity or thresholds](severity) to `warn` instead of `error` +- Don't want a test to cause skipping? Adjust its [severity or thresholds](/reference/resource-configs/severity) to `warn` instead of `error` - In the case of a test with multiple parents, where one parent depends on the other (e.g. a `relationships` test between `model_a` + `model_b`), that test will block-and-skip children of the most-downstream parent only (`model_b`). **Selecting resources:** The `build` task supports standard selection syntax (`--select`, `--exclude`, `--selector`), as well as a `--resource-type` flag that offers a final filter (just like `list`). Whichever resources are selected, those are the ones that `build` will run/test/snapshot/seed. -- Remember that tests support indirect selection, so `dbt build -s model_a` will both run _and_ test `model_a`. What does that mean? Any tests that directly depend on `model_a` will be included, so long as those tests don't also depend on other unselected parents. See [test selection](test-selection-examples) for details and examples. +- Remember that tests support indirect selection, so `dbt build -s model_a` will both run _and_ test `model_a`. What does that mean? Any tests that directly depend on `model_a` will be included, so long as those tests don't also depend on other unselected parents. See [test selection](/reference/node-selection/test-selection-examples) for details and examples. 
**Flags:** The `build` task supports all the same flags as `run`, `test`, `snapshot`, and `seed`. For flags that are shared between multiple tasks (e.g. `--full-refresh`), `build` will use the same value for all selected resource types (e.g. both models and seeds will be full refreshed). diff --git a/website/docs/reference/commands/clean.md b/website/docs/reference/commands/clean.md index 11b9b18ceaa..23a3f6080ce 100644 --- a/website/docs/reference/commands/clean.md +++ b/website/docs/reference/commands/clean.md @@ -1,14 +1,9 @@ --- -title: "clean" +title: "About dbt clean command" +sidebar_label: "clean" id: "clean" --- - +`dbt clean` is a utility function that deletes all folders specified in the [`clean-targets`](/reference/project-configs/clean-targets) list specified in `dbt_project.yml`. You can use this to delete the `dbt_packages` and `target` directories. -- **v1.0.0:** `dbt_modules` has been replaced by `dbt_packages` by default for the [clean-target](clean-targets) for packages. - - - -`dbt clean` is a utility function that deletes all folders specified in the `clean-targets` list specified in `dbt_project.yml`. You can use this to delete the `dbt_packages` and `target` directories. - -To avoid complex permissions issues and potentially deleting crucial aspects of the remote file system without access to fix them, this command does not work when interfacing with the RPC server that powers the dbt Cloud IDE. Instead, when working in dbt Cloud, the `dbt deps` command cleans before it installs packages automatically. The `target` folder can be manually deleted from the sidbear file tree if needed. +To avoid complex permissions issues and potentially deleting crucial aspects of the remote file system without access to fix them, this command does not work when interfacing with the RPC server that powers the dbt Cloud IDE. Instead, when working in dbt Cloud, the `dbt deps` command cleans before it installs packages automatically. 
The `target` folder can be manually deleted from the sidebar file tree if needed. diff --git a/website/docs/reference/commands/clone.md b/website/docs/reference/commands/clone.md new file mode 100644 index 00000000000..ea3e570447d --- /dev/null +++ b/website/docs/reference/commands/clone.md @@ -0,0 +1,39 @@ +--- +title: "About dbt clone command" +sidebar_label: "clone" +id: "clone" +--- + +The `dbt clone` command clones selected nodes from the [specified state](/reference/node-selection/syntax#establishing-state) to the target schema(s). This command makes use of the `clone` materialization: +- If your data platform supports zero-copy cloning of tables, and this model exists as a table in the source environment, dbt will create it in your target environment as a clone +- Otherwise, dbt will create a simple pointer view (`select * from` the source object) +- By default, `dbt clone` will not recreate pre-existing relations in the current target. To override this, use the `--full-refresh` flag. +- You may want to specify a higher number of [threads](/docs/running-a-dbt-project/using-threads) to decrease execution time since individual clone statements are independent of one another. 
+ +The `clone` command is useful for: +- blue/green continuous deployment (on data warehouses that support zero-copy cloning tables) +- cloning current production state into development schema(s) +- handling incremental models in dbt Cloud CI jobs (on data warehouses that support zero-copy cloning tables) +- testing code changes on downstream dependencies in your BI tool + +```bash +# clone all of my models from specified state to my target schema(s) +dbt clone --state path/to/artifacts + +# clone one_specific_model of my models from specified state to my target schema(s) +dbt clone --select "one_specific_model" --state path/to/artifacts + +# clone all of my models from specified state to my target schema(s) and recreate all pre-existing relations in the current target +dbt clone --state path/to/artifacts --full-refresh + +# clone all of my models from specified state to my target schema(s), running up to 50 clone statements in parallel +dbt clone --state path/to/artifacts --threads 50 +``` + +### When to use `dbt clone` instead of [deferral](/reference/node-selection/defer)? + +Unlike deferral, `dbt clone` requires some compute and creation of additional objects in your data warehouse. In many cases, deferral is a cheaper and simpler alternative to `dbt clone`. However, `dbt clone` covers additional use cases where deferral may not be possible. + +For example, by creating actual data warehouse objects, `dbt clone` allows you to test out your code changes on downstream dependencies _outside of dbt_ (such as a BI tool). + +As another example, you could `clone` your modified incremental models as the first step of your dbt Cloud CI job to prevent costly `full-refresh` builds for warehouses that support zero-copy cloning. 
diff --git a/website/docs/reference/commands/cmd-docs.md b/website/docs/reference/commands/cmd-docs.md index 33a46582bad..bc4840464b8 100644 --- a/website/docs/reference/commands/cmd-docs.md +++ b/website/docs/reference/commands/cmd-docs.md @@ -1,5 +1,6 @@ --- -title: "docs" +title: "About dbt docs commands" +sidebar_label: "docs" id: "cmd-docs" --- @@ -9,24 +10,49 @@ id: "cmd-docs" The command is responsible for generating your project's documentation website by -1. copying the website `index.html` file into the `target/` directory -2. compiling the project to `target/manifest.json` -3. producing the `target/catalog.json` file, which contains metadata about the tables and views produced by the models in your project. +1. Copying the website `index.html` file into the `target/` directory +2. Compiling the resources in your project, so that their `compiled_code` will be included in [`manifest.json`](/reference/artifacts/manifest-json) +3. Running queries against database metadata to produce the [`catalog.json`](/reference/artifacts/catalog-json) file, which contains metadata about the tables and views produced by the models in your project. **Example**: ``` dbt docs generate ``` -Use the `--no-compile` argument to skip re-compilation. When this flag is provided, `dbt docs generate` will only execute steps (1) and (3), as described above. + + +Use the `--select` argument to limit the nodes included within `catalog.json`. When this flag is provided, step (3) will be restricted to the selected nodes. All other nodes will be excluded. Step (2) is unaffected. + +**Example**: +```shell +dbt docs generate --select +orders +``` + + + + +Use the `--no-compile` argument to skip re-compilation. When this flag is provided, `dbt docs generate` will skip step (2) described above. **Example**: ``` dbt docs generate --no-compile ``` + + +Use the `--empty-catalog` argument to skip running the database queries to populate `catalog.json`. 
When this flag is provided, `dbt docs generate` will skip step (3) described above. + +This is not recommended for production environments, as it means that your documentation will be missing information gleaned from database metadata (the full set of columns in each table, and statistics about those tables). It can speed up `docs generate` in development, when you just want to visualize lineage and other information defined within your project. + +**Example**: +``` +dbt docs generate --empty-catalog +``` + + + ### dbt docs serve -This command starts a webserver on port 8000 to serve your documentation locally and opens the documentation site in your default browser. The webserver is rooted in your `target/` directory. Be sure to run `dbt docs generate` before `dbt docs serve` because the `generate` command produces a [catalog metadata artifact](/reference/artifacts/catalog-json) that the `serve` command depends upon. You will see an error message if the catalog is missing. +This command starts a webserver on port 8080 to serve your documentation locally and opens the documentation site in your default browser. The webserver is rooted in your `target/` directory. Be sure to run `dbt docs generate` before `dbt docs serve` because the `generate` command produces a [catalog metadata artifact](/reference/artifacts/catalog-json) that the `serve` command depends upon. You will see an error message if the catalog is missing. **Usage:** ``` diff --git a/website/docs/reference/commands/compile.md b/website/docs/reference/commands/compile.md index d9f2a48c032..cde65b7c6b6 100644 --- a/website/docs/reference/commands/compile.md +++ b/website/docs/reference/commands/compile.md @@ -1,5 +1,6 @@ --- -title: "compile" +title: "About dbt compile command" +sidebar_label: "compile" id: "compile" --- @@ -9,8 +10,66 @@ The `compile` command is useful for: 1. Visually inspecting the compiled output of model files. This is useful for validating complex jinja logic or macro usage. 2. 
Manually running compiled SQL. While debugging a model or schema test, it's often useful to execute the underlying `select` statement to find the source of the bug. -3. Compiling `analysis `files. Read more about analysis files [here](analyses). +3. Compiling `analysis` files. Read more about analysis files [here](/docs/build/analyses). -It is _not_ a pre-requisite of `dbt run`. +Some common misconceptions: +- `dbt compile` is _not_ a pre-requisite of `dbt run`, or other building commands. Those commands will handle compilation themselves. +- If you just want dbt to read and validate your project code, without connecting to the data warehouse, use `dbt parse` instead. -## Options + + +### Interactive compile + +Starting in dbt v1.5, `compile` can be "interactive" in the CLI, by displaying the compiled code of a node or arbitrary dbt-SQL query: +- `--select` a specific node _by name_ +- `--inline` an arbitrary dbt-SQL query + +This will log the compiled SQL to the terminal, in addition to writing to the `target/` directory. + +For example: + +```bash +dbt compile --select "stg_payments" +dbt compile --inline "select * from {{ ref('raw_orders') }}" +``` + +returns the following: + + +```bash +dbt compile --select "stg_orders" +21:17:09 Running with dbt=1.5.0-b5 +21:17:09 Found 5 models, 20 tests, 0 snapshots, 0 analyses, 425 macros, 0 operations, 3 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups +21:17:09 +21:17:09 Concurrency: 24 threads (target='dev') +21:17:09 +21:17:09 Compiled node 'stg_orders' is: +with source as ( + select * from "jaffle_shop"."main"."raw_orders" + +), + +renamed as ( + + select + id as order_id + user_id as customer_id + order_date + status + + from source + +) + +select * from renamed +``` + + + +The command accesses the data platform to cache-related metadata, and to run introspective queries. Use the flags: +- `--no-populate-cache` to disable the initial cache population. 
If metadata is needed, it will be a cache miss, requiring dbt to run the metadata query. This is a `dbt` flag, which means you need to add `dbt` as a prefix. For example: `dbt --no-populate-cache`. +- `--no-introspect` to disable [introspective queries](/faqs/warehouse/db-connection-dbt-compile#introspective-queries). dbt will raise an error if a model's definition requires running one. This is a `dbt compile` flag, which means you need to add `dbt compile` as a prefix. For example: `dbt compile --no-introspect`. + + +### FAQs + diff --git a/website/docs/reference/commands/debug.md b/website/docs/reference/commands/debug.md index d58048c211e..4ae5a1d2dd9 100644 --- a/website/docs/reference/commands/debug.md +++ b/website/docs/reference/commands/debug.md @@ -1,11 +1,27 @@ --- -title: "debug" +title: "About dbt debug command" +sidebar_label: "debug" id: "debug" --- -`dbt debug` is a utility function to show debug information. -The `--config-dir` option to `dbt debug` will show you where your `.dbt` configuration directory is located: +`dbt debug` is a utility function to test the database connection and display information for debugging purposes, such as the validity of your project file and your installation of any requisite dependencies (like `git` when you run `dbt deps`). + +*Note: Not to be confused with [debug-level logging](/reference/global-configs/about-global-configs#debug-level-logging) via the `--debug` option which increases verbosity.*
+ +### Example usage + + + +Only test the connection to the data platform and skip the other checks `dbt debug` looks for: + +```shell +$ dbt debug --connection +``` + + + +Show the configured location for the `profiles.yml` file and exit: ```text $ dbt debug --config-dir diff --git a/website/docs/reference/commands/deps.md b/website/docs/reference/commands/deps.md index a52f07bb934..f4f8153c115 100644 --- a/website/docs/reference/commands/deps.md +++ b/website/docs/reference/commands/deps.md @@ -1,16 +1,11 @@ --- -title: "deps" +title: "About dbt deps command" +sidebar_label: "deps" id: "deps" --- `dbt deps` pulls the most recent version of the dependencies listed in your `packages.yml` from git. See [Package-Management](/docs/build/packages) for more information. - - -- **v0.21.0**: dbt package version comparison logging is new! - - - Where relevant, dbt will display up to date and/or latest versions of packages that are listed on dbt Hub. Example below. > This does NOT apply to packages that are installed via git/local @@ -62,3 +57,31 @@ Installing calogica/dbt_date@0.4.0 Updates available for packages: ['tailsdotcom/dbt_artifacts', 'dbt-labs/snowplow'] Update your versions in packages.yml, then run dbt deps ``` + + + +dbt generates the `package-lock.yml` file in the _project_root_ where `packages.yml` is recorded, which contains all the resolved packages, the first time you run `dbt deps`. Each subsequent run records the packages installed in this file. If the subsequent `dbt deps` runs contain no updated packages in `dependencies.yml` or `packages.yml`, dbt-core installs from `package-lock.yml`. + +When you update the package spec and run `dbt deps` again, the package-lock and package files update accordingly. You can run `dbt deps --lock` to update the `package-lock.yml` with the most recent dependencies from `packages`. + +The `--add` flag allows you to add a package to the `packages.yml` with configurable `--version` and `--source` information.
The `--dry-run` flag, when set to `False` (default), recompiles the `package-lock.yml` file after a new package is added to the `packages.yml` file. Set the flag to `True` for the changes to not persist. + +Examples of the `--add` flag: +```shell +# add package from hub (--source arg defaults to "hub") +dbt deps add --package dbt-labs/dbt_utils --version 1.0.0 + +# add package from hub with semantic version +dbt deps add --package dbt-labs/snowplow --version ">=0.7.0,<0.8.0" + +# add package from git +dbt deps add --package https://github.com/fivetran/dbt_amplitude --version v0.3.0 --source git + +# add package from local (--version not required for local) +dbt deps add --package /opt/dbt/redshift --source local + +# add package to packages.yml WITHOUT updating package-lock.yml +dbt deps add --package dbt-labs/dbt_utils --version 1.0.0 --dry-run True + +``` + \ No newline at end of file diff --git a/website/docs/reference/commands/init.md b/website/docs/reference/commands/init.md index c2f67e836cd..ac55717c0ec 100644 --- a/website/docs/reference/commands/init.md +++ b/website/docs/reference/commands/init.md @@ -1,37 +1,47 @@ --- -title: "init" +title: "About dbt init command" +sidebar_label: "init" id: "init" --- -:::info Improved in v1.0! -The `init` command is interactive and responsive like never before. -::: - `dbt init` helps get you started using dbt Core!
## New project If this is your first time ever using the tool, it will: - ask you to name your project -- ask you which database adapter you're using (or to [Supported Data Platforms](supported-data-platforms)) +- ask you which database adapter you're using (or to [Supported Data Platforms](/docs/supported-data-platforms)) - prompt you for each piece of information that dbt needs to connect to that database: things like `account`, `user`, `password`, etc Then, it will: - Create a new folder with your project name and sample files, enough to get you started with dbt -- Create a connection profile on your local machine. The default location is `~/.dbt/profiles.yml`. Read more in [configuring your profile](/docs/get-started/connection-profiles). +- Create a connection profile on your local machine. The default location is `~/.dbt/profiles.yml`. Read more in [configuring your profile](/docs/core/connect-data-platform/connection-profiles). + + + +When using `dbt init` to initialize your project, include the `--profile` flag to specify an existing `profiles.yml` as the `profile:` key to use instead of creating a new one. For example, `dbt init --profile`. + + + +If the profile does not exist in `profiles.yml` or the command is run inside an existing project, the command raises an error. + + ## Existing project If you've just cloned or downloaded an existing dbt project, `dbt init` can still help you set up your connection profile so that you can start working quickly. It will prompt you for connection information, as above, and add a profile (using the `profile` name from the project) to your local `profiles.yml`, or create the file if it doesn't already exist. + ## profile_template.yml `dbt init` knows how to prompt for connection information by looking for a file named `profile_template.yml`. It will look for this file in two places: -- **Adapter plugin:** What's the bare minumum Postgres profile? What's the type of each field, what are its defaults? 
This information is stored in a file called [`dbt/include/postgres/profile_template.yml`](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml). If you're the maintainer of an adapter plugin, we highly recommend that you add a `profile_template.yml` to your plugin, too. See more details in [building-a-new-adapter](/guides/advanced/adapter-development/3-building-a-new-adapter). +- **Adapter plugin:** What's the bare minimum Postgres profile? What's the type of each field, what are its defaults? This information is stored in a file called [`dbt/include/postgres/profile_template.yml`](https://github.com/dbt-labs/dbt-core/blob/main/plugins/postgres/dbt/include/postgres/profile_template.yml). If you're the maintainer of an adapter plugin, we highly recommend that you add a `profile_template.yml` to your plugin, too. See more details in [building-a-new-adapter](/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter). - **Existing project:** If you're the maintainer of an existing project, and you want to help new users get connected to your database quickly and easily, you can include your own custom `profile_template.yml` in the root of your project, alongside `dbt_project.yml`. For common connection attributes, set the values in `fixed`; leave user-specific attributes in `prompts`, but with custom hints and defaults as you'd like. + + ```yml @@ -43,6 +53,9 @@ fixed: type: snowflake warehouse: transforming prompts: + target: + type: string + hint: your desired target name user: type: string hint: yourname@jaffleshop.com @@ -57,9 +70,11 @@ prompts: + + ``` $ dbt init -Running with dbt=1.0.0-b2 +Running with dbt=1.0.0 Setting up your profile.
user (yourname@jaffleshop.com): summerintern@jaffleshop.com schema (usually dbt_): dbt_summerintern diff --git a/website/docs/reference/commands/list.md b/website/docs/reference/commands/list.md index 745c12d3862..5caabdc2b2e 100644 --- a/website/docs/reference/commands/list.md +++ b/website/docs/reference/commands/list.md @@ -1,32 +1,33 @@ --- -title: "ls (list)" +title: "About dbt ls (list) command" +sidebar_label: "ls (list)" +description: "Read this guide on how dbt's ls (list) command can be used to list resources in your dbt project." id: "list" --- -## Overview - -The `dbt ls` command lists resources in your dbt project. It accepts selector arguments that are similar to those provided in [dbt run](run). `dbt list` is an alias for `dbt ls`. While `dbt ls` will read your [connection profile](/docs/get-started/connection-profiles) to resolve [`target`](dbt-jinja-functions/target)-specific logic, this command will not connect to your database or run any queries. +The `dbt ls` command lists resources in your dbt project. It accepts selector arguments that are similar to those provided in [dbt run](/reference/commands/run). `dbt list` is an alias for `dbt ls`. While `dbt ls` will read your [connection profile](/docs/core/connect-data-platform/connection-profiles) to resolve [`target`](/reference/dbt-jinja-functions/target)-specific logic, this command will not connect to your database or run any queries. 
### Usage + ``` dbt ls - [--resource-type {source,analysis,model,snapshot,test,seed,exposure,default,all}] + [--resource-type {model,semantic_model,source,seed,snapshot,metric,test,exposure,analysis,default,all}] [--select SELECTION_ARG [SELECTION_ARG ...]] [--models SELECTOR [SELECTOR ...]] [--exclude SELECTOR [SELECTOR ...]] - [--selector YML_SELECTOR_NAME [YML_SELECTOR_NAME ...]] + [--selector YML_SELECTOR_NAME] [--output {json,name,path,selector}] [--output-keys KEY_NAME [KEY_NAME]] ``` -See [resource selection syntax](node-selection/syntax) for more information on how to select resources in dbt +See [resource selection syntax](/reference/node-selection/syntax) for more information on how to select resources in dbt **Arguments**: -- `--resource-type`: This flag limits the "resource types" that dbt will return in the `dbt ls` command. By default, the following resources are included in the results of `dbt ls`: models, snapshots, seeds, tests, and sources. +- `--resource-type`: This flag restricts the "resource types" returned by dbt in the `dbt ls` command. By default, all resource types are included in the results of `dbt ls` except for the analysis type. - `--select`: This flag specifies one or more selection-type arguments used to filter the nodes returned by the `dbt ls` command - `--models`: Like the `--select` flag, this flag is used to select nodes. It implies `--resource-type=model`, and will only return models in the results of the `dbt ls` command. Supported for backwards compatibility only. - `--exclude`: Specify selectors that should be _excluded_ from the list of returned nodes. -- `--selector`: This flag specifies one or more named selectors, defined in a `selectors.yml` file. +- `--selector`: This flag specifies one named selector, defined in a `selectors.yml` file. - `--output`: This flag controls the format of output from the `dbt ls` command. 
- `--output-keys`: If `--output json`, this flag controls which node properties are included in the output. @@ -71,13 +72,39 @@ $ dbt ls --select snowplow.* --output json ``` **Listing JSON output with custom keys** + + + +``` +$ dbt ls --select snowplow.* --output json --output-keys "name resource_type description" +{"name": "snowplow_events", "description": "This is a pretty cool model", ...} +{"name": "snowplow_page_views", "description": "This model is even cooler", ...} +... +``` + + + + ``` -$ dbt ls --select snowplow.* --output json --output-keys name description +$ dbt ls --select snowplow.* --output json --output-keys "name resource_type description" {"name": "snowplow_events", "description": "This is a pretty cool model", ...} {"name": "snowplow_page_views", "description": "This model is even cooler", ...} ... ``` + + + + +**Listing Semantic models** + +List all resources upstream of your orders semantic model: +``` +dbt ls -s +semantic_model:orders +``` + + + **Listing file paths** ``` dbt ls --select snowplow.* --output path diff --git a/website/docs/reference/commands/parse.md b/website/docs/reference/commands/parse.md index 5139120de20..92e66d3f192 100644 --- a/website/docs/reference/commands/parse.md +++ b/website/docs/reference/commands/parse.md @@ -1,34 +1,24 @@ --- -title: "parse" +title: "About dbt parse command" +sidebar_label: "parse" +description: "Read this guide on how dbt's parse command can be used to parse your dbt project and write detailed timing information." id: "parse" --- - +The `dbt parse` command parses and validates the contents of your dbt project. If your project contains Jinja or YAML syntax errors, the command will fail. -New in `v0.19.0` +It will also produce an artifact with detailed timing information, which is useful to understand parsing times for large projects. Refer to [Project parsing](/reference/parsing) for more information. 
+ + -## Overview +Starting in v1.5, `dbt parse` will write or return a [manifest](/reference/artifacts/manifest-json), enabling you to introspect dbt's understanding of all the resources in your project. -The `dbt parse` command parses your dbt project and writes detailed timing information. If your project contains Jinja or YAML syntax errors, the command will fail. + -### Usage ``` $ dbt parse -Running with dbt=0.19.0 -11:53:29 | Start parsing. -11:53:29 | Macro manifest loaded -11:53:29 | Dependencies loaded -11:53:29 | ManifestLoader created -11:53:29 | Manifest loaded -11:53:29 | Parse results written -11:53:29 | Manifest created -11:53:29 | Manifest checked -11:53:29 | Flat graph built -11:53:29 | Manifest loaded -11:53:29 | Performance info: target/perf_info.json -11:53:29 | Done. +13:02:52 Running with dbt=1.5.0 +13:02:53 Performance info: target/perf_info.json ``` diff --git a/website/docs/reference/commands/retry.md b/website/docs/reference/commands/retry.md new file mode 100644 index 00000000000..8da5d5a77a6 --- /dev/null +++ b/website/docs/reference/commands/retry.md @@ -0,0 +1,99 @@ +--- +title: "About dbt retry command" +sidebar_label: "retry" +id: "retry" +--- + +`dbt retry` re-executes the last `dbt` command from the node point of failure. If the previously executed `dbt` command was successful, `retry` will finish as `no operation`. + +Retry works with the following commands: + +- [`build`](/reference/commands/build) +- [`compile`](/reference/commands/compile) +- [`seed`](/reference/commands/seed) +- [`snapshot`](/reference/commands/snapshot) +- [`test`](/reference/commands/test) +- [`run`](/reference/commands/run) +- [`run-operation`](/reference/commands/run-operation) + +`dbt retry` references [run_results.json](/reference/artifacts/run-results-json) to determine where to start. Executing `dbt retry` without correcting the previous failures will garner the same results.
+ +`dbt retry` reuses the [selectors](/reference/node-selection/yaml-selectors) from the previously executed command. + + +Example results of executing `dbt retry` after a successful `dbt run`: + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Nothing to do. Try checking your model configs and model specification args +``` + +Example of when `dbt run` encounters a syntax error in a model: + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 5 START sql view model main.stg_customers ................................. [RUN] +2 of 5 START sql view model main.stg_orders .................................... [RUN] +3 of 5 START sql view model main.stg_payments .................................. [RUN] +1 of 5 OK created sql view model main.stg_customers ............................ [OK in 0.06s] +2 of 5 OK created sql view model main.stg_orders ............................... [OK in 0.06s] +3 of 5 OK created sql view model main.stg_payments ............................. [OK in 0.07s] +4 of 5 START sql table model main.customers .................................... [RUN] +5 of 5 START sql table model main.orders ....................................... [RUN] +4 of 5 ERROR creating sql table model main.customers ........................... [ERROR in 0.03s] +5 of 5 OK created sql table model main.orders .................................. [OK in 0.04s] + +Finished running 3 view models, 2 table models in 0 hours 0 minutes and 0.15 seconds (0.15s). + +Completed with 1 error and 0 warnings: + +Runtime Error in model customers (models/customers.sql) + Parser Error: syntax error at or near "selct" + +Done. 
PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5 +``` + + +Example of a subsequent failed `dbt retry` run without fixing the error(s): + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 1 START sql table model main.customers .................................... [RUN] +1 of 1 ERROR creating sql table model main.customers ........................... [ERROR in 0.03s] + +Done. PASS=4 WARN=0 ERROR=1 SKIP=0 TOTAL=5 +``` + +Example of a successful `dbt retry` run after fixing error(s): + +```shell +Running with dbt=1.6.1 +Registered adapter: duckdb=1.6.0 +Found 5 models, 3 seeds, 20 tests, 0 sources, 0 exposures, 0 metrics, 348 macros, 0 groups, 0 semantic models + +Concurrency: 24 threads (target='dev') + +1 of 1 START sql table model main.customers .................................... [RUN] +1 of 1 OK created sql table model main.customers ............................... [OK in 0.05s] + +Finished running 1 table model in 0 hours 0 minutes and 0.09 seconds (0.09s). + +Completed successfully + +Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1 +``` + +In each scenario `dbt retry` picks up from the error rather than running all of the upstream dependencies again. diff --git a/website/docs/reference/commands/rpc.md b/website/docs/reference/commands/rpc.md index e2d7c448956..809eadee639 100644 --- a/website/docs/reference/commands/rpc.md +++ b/website/docs/reference/commands/rpc.md @@ -1,25 +1,22 @@ --- -title: "rpc" +title: "About dbt rpc command" +sidebar_label: "rpc" id: "rpc" description: "Remote Procedure Call (rpc) dbt server compiles and runs queries, and provides methods that enable you to list and terminate running processes. 
" --- - +:::caution The dbt-rpc plugin is deprecated - - **v0.14**: The `dbt rpc` command was introduced to dbt Core - - **v1.0**: We now distribute and package the Remote Procedure Call (rpc) server functionality separately from `dbt-core`. You can find the code in a dedicated [`dbt-rpc` repository](https://github.com/dbt-labs/dbt-rpc). - +dbt Labs actively maintained `dbt-rpc` for compatibility with dbt-core versions up to v1.5. Starting with dbt-core v1.6 (released in July 2023), `dbt-rpc` is no longer supported for ongoing compatibility. -### Overview +In the meantime, dbt Labs will be performing critical maintenance only for `dbt-rpc`, until the last compatible version of dbt-core has reached the [end of official support](/docs/dbt-versions/core#latest-releases). At that point, dbt Labs will archive this repository to be read-only. -You can use the `dbt-rpc` plugin to run a Remote Procedure Call (rpc) dbt server. This server compiles and runs queries in the context of a dbt project. Additionally, the RPC server provides methods that enable you to list and terminate running processes. We recommend running an rpc server from a directory containing a dbt project. The server will compile the project into memory, then accept requests to operate against that project's dbt context. +::: -:::caution Deprecation -**The dbt-rpc plugin will be fully deprecated by the end of 2022.** +### Overview -For now, dbt Labs actively maintains and uses `dbt-rpc` to enable interactive dbt development. Once we announce the next-generation dbt Server is available for general release, we will deprecate the legacy plugin and only fix critical issues for a period of six months. After six months, we will archive this repository for read-only use. -::: +You can use the `dbt-rpc` plugin to run a Remote Procedure Call (rpc) dbt server. This server compiles and runs queries in the context of a dbt project. 
Additionally, the RPC server provides methods that enable you to list and terminate running processes. We recommend running an rpc server from a directory containing a dbt project. The server will compile the project into memory, then accept requests to operate against that project's dbt context. :::caution Running on Windows We do not recommend running the rpc server on Windows because of reliability issues. A Docker container may provide a useful workaround, if required. @@ -238,13 +235,13 @@ All RPC requests accept the following parameters in addition to the parameters l ``` Several of the following request types accept these additional parameters: -- `threads`: The number of [threads](/docs/get-started/connection-profiles#understanding-threads) to use when compiling (optional) +- `threads`: The number of [threads](/docs/core/connect-data-platform/connection-profiles#understanding-threads) to use when compiling (optional) - `select`: The space-delimited set of resources to execute (optional). (`models` is also supported on some request types for backwards compatibility.) 
-- `selector`: The name of a predefined [YAML selector](node-selection/yaml-selectors) that defines the set of resources to execute (optional) +- `selector`: The name of a predefined [YAML selector](/reference/node-selection/yaml-selectors) that defines the set of resources to execute (optional) - `exclude`: The space-delimited set of resources to exclude from compiling, running, testing, seeding, or snapshotting (optional) -- `state`: The filepath of artifacts to use when establishing [state](understanding-state) (optional) +- `state`: The filepath of artifacts to use when establishing [state](/reference/node-selection/syntax#about-node-selection) (optional) -### Compile a project ([docs](compile)) +### Compile a project ([docs](/reference/commands/compile)) ```json { @@ -261,7 +258,7 @@ Several of the following request types accept these additional parameters: } ``` -### Run models ([docs](run)) +### Run models ([docs](/reference/commands/run)) **Additional parameters:** - `defer`: Whether to defer references to upstream, unselected resources (optional, requires `state`) @@ -282,7 +279,7 @@ Several of the following request types accept these additional parameters: } ``` -### Run tests ([docs](commands/test)) +### Run tests ([docs](/reference/commands/test)) **Additional parameters:** - `data`: If True, run data tests (optional, default=true) @@ -305,7 +302,7 @@ Several of the following request types accept these additional parameters: } ``` -### Run seeds ([docs](seed)) +### Run seeds ([docs](/reference/commands/seed)) **Parameters:** - `show`: If True, show a sample of the seeded data in the response (optional, default=false) @@ -343,7 +340,7 @@ Several of the following request types accept these additional parameters: } ``` -### Build ([docs](build)) +### Build ([docs](/reference/commands/build)) ```json { @@ -370,7 +367,7 @@ Several of the following request types accept these additional parameters: ```json { "jsonrpc": "2.0", - "method": "build", + "method": 
"ls", "id": "", "params": { "select": " (optional)", @@ -447,9 +444,9 @@ This query executes the sql `select {{ 1 + 1 }} as id` (base64-encoded) against t The resulting response will include a key called `table` with a value of `{'column_names': ['?column?'], 'rows': [[2.0]]}` -## Reloading the Server +## Reloading the RPC Server -When the dbt Server starts, it will load the dbt project into memory using the files present on disk at startup. If the files in the dbt project should change (either during development or in a deployment), the dbt Server can be updated live without cycling the server process. To reload the files present on disk, send a "hangup" signal to the running server process using the Process ID (pid) of the running process. +When the dbt RPC Server starts, it will load the dbt project into memory using the files present on disk at startup. If the files in the dbt project should change (either during development or in a deployment), the dbt RPC Server can be updated live without cycling the server process. To reload the files present on disk, send a "hangup" signal to the running server process using the Process ID (pid) of the running process. ### Finding the server PID diff --git a/website/docs/reference/commands/run-operation.md b/website/docs/reference/commands/run-operation.md index 95b4f17b534..6b842b3064c 100644 --- a/website/docs/reference/commands/run-operation.md +++ b/website/docs/reference/commands/run-operation.md @@ -1,15 +1,17 @@ --- -title: "run-operation" +title: "About dbt run-operation command" +sidebar_label: "run-operation" +description: "Read this guide on how dbt's run-operation command can be used to invoke a macro." id: "run-operation" --- ### Overview -The `dbt run-operation` command is used to invoke a macro. For usage information, consult the docs on [operations](hooks-operations#operations). +The `dbt run-operation` command is used to invoke a macro.
For usage information, consult the docs on [operations](/docs/build/hooks-operations#operations). ### Usage ``` -$ dbt run-operation {macro} --args {args} +$ dbt run-operation {macro} --args '{args}' {macro} Specify the macro to invoke. dbt will call this macro with the supplied arguments and then exit --args ARGS Supply arguments to the macro. This dictionary will be @@ -17,3 +19,12 @@ $ dbt run-operation {macro} --args {args} selected macro. This argument should be a YAML string, eg. '{my_variable: my_value}' ``` +### Command line examples + +Example 1: + +`$ dbt run-operation grant_select --args '{role: reporter}'` + +Example 2: + +`$ dbt run-operation clean_stale_models --args '{days: 7, dry_run: True}'` diff --git a/website/docs/reference/commands/run.md b/website/docs/reference/commands/run.md index 0b775a157f0..557d0d71338 100644 --- a/website/docs/reference/commands/run.md +++ b/website/docs/reference/commands/run.md @@ -1,5 +1,7 @@ --- -title: "run" +title: "About dbt run command" +sidebar_label: "run" +description: "Read this guide on how dbt's run command can be used to execute compiled SQL model files against a target database." id: "run" --- @@ -40,7 +42,7 @@ You can also supply the flag by its short name: `dbt run -f`. -In the dbt compilation context, this flag will be available as [flags.FULL_REFRESH](flags). Further, the `is_incremental()` macro will return `false` for *all* models in response when the `--full-refresh` flag is specified. +In the dbt compilation context, this flag will be available as [flags.FULL_REFRESH](/reference/dbt-jinja-functions/flags). Further, the `is_incremental()` macro will return `false` for *all* models in response when the `--full-refresh` flag is specified. @@ -63,36 +65,18 @@ select * from all_events dbt will also allow you select which specific models you'd like to materialize. This can be useful during special scenarios where you may prefer running a different set of models at various intervals. 
This can also be helpful when you may want to limit the tables materialized while you develop and test new models. -For more information, see the [Model Selection Syntax Documentation](node-selection/syntax). +For more information, see the [Model Selection Syntax Documentation](/reference/node-selection/syntax). -## Treat warnings as errors - - - -- Moved to [global configs](global-configs) in v1.0 +For more information on running parents or children of specific models, see the [Graph Operators Documentation](/reference/node-selection/graph-operators). - +## Treat warnings as errors -See [global configs](global-configs#failing-fast) +See [global configs](/reference/global-configs/warnings) ## Failing fast - - -- The `--fail-fast` flag is new in dbt v0.17.0 -- Moved to [global configs](global-configs) in v1.0 - - - -See [global configs](global-configs#failing-fast) +See [global configs](/reference/global-configs/failing-fast) ## Enable or Disable Colorized Logs - - -- The `--use-colors` and `--no-use-colors` flags are new in dbt v0.18.0 -- Moved to [global configs](global-configs) in v1.0 - - - -See [global configs](global-configs#use-colors) +See [global configs](/reference/global-configs/print-output#print-color) diff --git a/website/docs/reference/commands/seed.md b/website/docs/reference/commands/seed.md index e60ceced0d3..d0cd199ea12 100644 --- a/website/docs/reference/commands/seed.md +++ b/website/docs/reference/commands/seed.md @@ -1,26 +1,18 @@ --- -title: "seed" +title: "About dbt seed command" +sidebar_label: "seed" id: "seed" --- - - -- **v1.0.0:** The default config for this command will now be `seed-paths` instead of `data-paths`. - - - - The `dbt seed` command will load `csv` files located in the `seed-paths` directory of your dbt project into your data warehouse. ### Selecting seeds to run - Added in v0.16.0 - Specific seeds can be run using the `--select` flag to `dbt seed`. 
Example: ``` -$ dbt seed --select country_codes +$ dbt seed --select "country_codes" Found 2 models, 3 tests, 0 archives, 0 analyses, 53 macros, 0 operations, 2 seed files 14:46:15 | Concurrency: 1 threads (target='dev') diff --git a/website/docs/reference/commands/show.md b/website/docs/reference/commands/show.md new file mode 100644 index 00000000000..a0e5d68c83f --- /dev/null +++ b/website/docs/reference/commands/show.md @@ -0,0 +1,72 @@ +--- +title: "About dbt show command" +sidebar_label: "show" +id: "show" +--- + +Use `dbt show` to: +- Compile the dbt-SQL definition of a `model`, `test`, `analysis`, or an arbitrary dbt-SQL query passed `--inline` +- Run that query against the data warehouse +- Preview the results in the terminal + +By default, `dbt show` will display the first 5 rows from the query result. This can be customized by passing the flag `--limit n`, where `n` is the number of rows to display. + +The results of the preview query are not materialized in the data warehouse, or stored in any dbt file. They are only included in dbt's logs and displayed in the terminal. Note also that, if previewing a model, dbt will always compile and run the compiled query from source. It will not select from the already-materialized database relation, even if you've just run the model. (We may support that in the future; if you're interested, upvote or comment on [dbt-core#7391](https://github.com/dbt-labs/dbt-core/issues/7391).) 
+ +Example: + +``` +dbt show --select "model_name.sql" +``` +or +``` +dbt show --inline "select * from {{ ref('model_name') }}" +``` + +The following is an example of `dbt show` output for a model named `stg_orders`: + +```bash +dbt show --select "stg_orders" +21:17:38 Running with dbt=1.5.0-b5 +21:17:38 Found 5 models, 20 tests, 0 snapshots, 0 analyses, 425 macros, 0 operations, 3 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups +21:17:38 +21:17:38 Concurrency: 24 threads (target='dev') +21:17:38 +21:17:38 Previewing node 'stg_orders' : +| order_id | customer_id | order_date | status | +|----------+-------------+------------+-------- | +| 1 | 1 | 2023-01-01 | returned | +| 2 | 3 | 2023-01-02 | completed | +| 3 | 94 | 2023-01-03 | completed | +| 4 | 50 | 2023-01-04 | completed | +| 5 | 64 | 2023-01-05 | completed | + +``` + +For example, if you've just built a model that has a failing test, you can quickly preview the test failures right in the terminal, to find values of `id` that are duplicated: + +```bash +$ dbt build -s "my_model_with_duplicates" +13:22:47 Running with dbt=1.5.0 +... +13:22:48 Completed with 1 error and 0 warnings: +13:22:48 +13:22:48 Failure in test unique_my_model_with_duplicates (models/schema.yml) +13:22:48 Got 1 result, configured to fail if not 0 +13:22:48 +13:22:48 compiled code at target/compiled/my_dbt_project/models/schema.yml/unique_my_model_with_duplicates_id.sql +13:22:48 +13:22:48 Done. 
PASS=1 WARN=0 ERROR=1 SKIP=0 TOTAL=2 + +$ dbt show -s "unique_my_model_with_duplicates_id" +13:22:53 Running with dbt=1.5.0 +13:22:53 Found 4 models, 2 tests, 0 snapshots, 0 analyses, 309 macros, 0 operations, 0 seed files, 0 sources, 0 exposures, 0 metrics, 0 groups +13:22:53 +13:22:53 Concurrency: 5 threads (target='dev') +13:22:53 +13:22:53 Previewing node 'unique_my_model_with_duplicates_id': +| unique_field | n_records | +| ------------ | --------- | +| 1 | 2 | + +``` diff --git a/website/docs/reference/commands/snapshot.md b/website/docs/reference/commands/snapshot.md index 9349a1469ec..c6011a6e169 100644 --- a/website/docs/reference/commands/snapshot.md +++ b/website/docs/reference/commands/snapshot.md @@ -1,9 +1,10 @@ --- -title: "snapshot" +title: "About dbt snapshot command" +sidebar_label: "snapshot" id: "snapshot" --- -The `dbt snapshot` command executes the [Snapshots](snapshots) defined in your project. +The `dbt snapshot` command executes the [Snapshots](/docs/build/snapshots) defined in your project. dbt will look for Snapshots in the `snapshot-paths` paths defined in your `dbt_project.yml` file. By default, the `snapshot-paths` path is `snapshots/`. @@ -22,4 +23,4 @@ optional arguments: Specify the snapshots to include in the run. --exclude EXCLUDE [EXCLUDE ...] Specify the snapshots to exclude in the run. -``` \ No newline at end of file +``` diff --git a/website/docs/reference/commands/source.md b/website/docs/reference/commands/source.md index 230b92669a1..697ae2b5fcc 100644 --- a/website/docs/reference/commands/source.md +++ b/website/docs/reference/commands/source.md @@ -1,40 +1,29 @@ --- -title: "source" +title: "About dbt source command" +sidebar_label: "source" id: "source" --- The `dbt source` command provides subcommands that are useful when working with source data. This command provides one subcommand, `dbt source freshness`. 
:::info -If you're using an older version of dbt Core (before v0.21), the old name of the `freshness` subcommand was `snapshot-freshness`. (It has nothing to do with [snapshots](snapshots), which is why we renamed it.) Each time you see the command below, you'll need to specify it as `dbt source snapshot-freshness` instead of `dbt source freshness`. +If you're using an older version of dbt Core (before v0.21), the old name of the `freshness` subcommand was `snapshot-freshness`. (It has nothing to do with [snapshots](/docs/build/snapshots), which is why we renamed it.) Each time you see the command below, you'll need to specify it as `dbt source snapshot-freshness` instead of `dbt source freshness`. ::: ### dbt source freshness - - - - **v0.21.0:** Renamed `dbt source snapshot-freshness` to `dbt source freshness`. If using an older version of dbt, the command is `snapshot-freshness`. - - - If your dbt project is [configured with sources](/docs/build/sources), then the `dbt source freshness` command will query all of your defined source tables, determining the "freshness" of these tables. If the tables are stale (based on the `freshness` config specified for your sources) then dbt will report a warning or error accordingly. If a source is in a stale state, then dbt will exit with a nonzero exit code. ### Specifying sources to snapshot - - - - **v0.21.0:** Selection syntax for the `freshness` task now mirrors other tasks. Sources need to be prefixed with the `source:` selection method. In previous versions of dbt, sources were specified by name only. - - - By default, `dbt source freshness` will calculate freshness information for all of the sources in your project. To snapshot freshness for a subset of these sources, use the `--select` flag. 
```bash # Snapshot freshness for all Snowplow tables: -$ dbt source freshness --select source:snowplow +$ dbt source freshness --select "source:snowplow" # Snapshot freshness for a particular source table: -$ dbt source freshness --select source:snowplow.event +$ dbt source freshness --select "source:snowplow.event" ``` ### Configuring source freshness output diff --git a/website/docs/reference/commands/test.md b/website/docs/reference/commands/test.md index 27d6f62e260..c050d82a0ab 100644 --- a/website/docs/reference/commands/test.md +++ b/website/docs/reference/commands/test.md @@ -1,30 +1,31 @@ --- -title: "test" +title: "About dbt test command" +sidebar_label: "test" id: "test" --- `dbt test` runs tests defined on models, sources, snapshots, and seeds. It expects that you have already created those resources through the appropriate commands. -The tests to run can be selected using the `--select` flag discussed [here](node-selection/syntax). +The tests to run can be selected using the `--select` flag discussed [here](/reference/node-selection/syntax). ```bash # run tests for one_specific_model -dbt test --select one_specific_model +dbt test --select "one_specific_model" # run tests for all models in package -dbt test --select some_package.* +dbt test --select "some_package.*" # run only tests defined singularly -dbt test --select test_type:singular +dbt test --select "test_type:singular" # run only tests defined generically -dbt test --select test_type:generic +dbt test --select "test_type:generic" # run singular tests limited to one_specific_model -dbt test --select one_specific_model,test_type:singular +dbt test --select "one_specific_model,test_type:singular" # run generic tests limited to one_specific_model -dbt test --select one_specific_model,test_type:generic +dbt test --select "one_specific_model,test_type:generic" ``` For more information on writing tests, see the [Testing Documentation](/docs/build/tests). 
diff --git a/website/docs/reference/configs-and-properties.md b/website/docs/reference/configs-and-properties.md index 436f06f704d..c2ad5b77629 100644 --- a/website/docs/reference/configs-and-properties.md +++ b/website/docs/reference/configs-and-properties.md @@ -2,10 +2,6 @@ title: Configs, properties, what are they? --- - - - **v0.21.0** introduced the `config` property, thereby allowing you to configure certain resource types in all `.yml` files - - Resources in your project—models, snapshots, seeds, tests, and the rest—can have a number of declared **properties**. Resources can also define **configurations**, which are a special kind of property that bring extra abilities. What's the distinction? - Properties are declared for resources one-by-one in `.yml` files. Configs can be defined there, nested under a `config` property. They can also be set one-by-one via a `config()` macro (right within `.sql` files), and for many resources at once in `dbt_project.yml`. - Because configs can be set in multiple places, they are also applied hierarchically. An individual resource might _inherit_ or _override_ configs set elsewhere. @@ -17,7 +13,7 @@ For example, you can use resource **properties** to: * Describe models, snapshots, seed files, and their columns - Assert "truths" about a model, in the form of [tests](/docs/build/tests), e.g. "this `id` column is unique" * Define pointers to existing tables that contain raw data, in the form of [sources](/docs/build/sources), and assert the expected "freshness" of this raw data -* Define official downstream uses of your data models, in the form of [exposures](exposures) +* Define official downstream uses of your data models, in the form of [exposures](/docs/build/exposures) Whereas you can use **configurations** to: * Change how a model will be materialized (, , incremental, etc) @@ -30,14 +26,14 @@ Whereas you can use **configurations** to: Depending on the resource type, configurations can be defined: 1. 
Using a [`config()` Jinja macro](/reference/dbt-jinja-functions/config) within a `model`, `snapshot`, or `test` SQL file -2. Using a [`config` property](resource-properties/config) in a `.yml` file +2. Using a [`config` property](/reference/resource-properties/config) in a `.yml` file 3. From the [`dbt_project.yml` file](dbt_project.yml), under the corresponding resource key (`models:`, `snapshots:`, `tests:`, etc) ### Config inheritance dbt prioritizes configurations in order of specificity, from most specificity to least specificity. This generally follows the order above: an in-file `config()` block --> properties defined in a `.yml` file --> config defined in the project file. -Note - Generic tests work a little differently when it comes to specificity. See [test configs](test-configs). +Note - Generic tests work a little differently when it comes to specificity. See [test configs](/reference/test-configs). Within the project file, configurations are also applied hierarchically. The most-specific config always "wins": In the project file, configurations applied to a `marketing` subdirectory will take precedence over configurations applied to the entire `jaffle_shop` project. To apply a configuration to a model, or directory of models, define the resource path as nested dictionary keys. @@ -45,8 +41,8 @@ Within the project file, configurations are also applied hierarchically. The mos Most configurations are "clobbered" when applied hierarchically. Whenever a more-specific value is available, it will completely replace the less-specific value. Note that a few configs have different merge behavior: - [`tags`](tags) are additive. If a model has some tags configured in `dbt_project.yml`, and more tags applied in its `.sql` file, the final set of tags will include all of them. 
-- [`meta`](meta) dictionaries are merged (a more specific key-value pair replaces a less specific value with the same key) -- [`pre-hook` and `post-hook`](pre-hook-post-hook) are also additive. +- [`meta`](/reference/resource-configs/meta) dictionaries are merged (a more specific key-value pair replaces a less specific value with the same key) +- [`pre-hook` and `post-hook`](/reference/resource-configs/pre-hook-post-hook) are also additive. ## Where can I define properties? @@ -68,7 +64,7 @@ Previous versions of the docs referred to these as `schema.yml` files — we've ### Which properties are _not_ also configs? -In v0.21, dbt added the ability to define node configs in `.yml` files, in addition to `config()` blocks and `dbt_project.yml`. But the reverse isn't always true: there are some things in `.yml` files that can _only_ be defined there. +dbt has the ability to define node configs in `.yml` files, in addition to `config()` blocks and `dbt_project.yml`. But the reverse isn't always true: there are some things in `.yml` files that can _only_ be defined there. Certain properties are special, because: - They have a unique Jinja rendering context @@ -77,14 +73,14 @@ Certain properties are special, because: - They're older properties that haven't yet been redefined as configs These properties are: -- [`description`](resource-properties/description) -- [`tests`](resource-properties/tests) +- [`description`](/reference/resource-properties/description) +- [`tests`](/reference/resource-properties/tests) - [`docs`](/reference/resource-configs/docs) -- [`columns`](resource-properties/columns) -- [`quote`](resource-properties/quote) -- [`source` properties](source-properties) (e.g. `loaded_at_field`, `freshness`) -- [`exposure` properties](exposure-properties) (e.g. `type`, `maturity`) -- [`macro` properties](macro-properties) (e.g. 
`arguments`) +- [`columns`](/reference/resource-properties/columns) +- [`quote`](/reference/resource-properties/quote) +- [`source` properties](/reference/source-properties) (e.g. `loaded_at_field`, `freshness`) +- [`exposure` properties](/reference/exposure-properties) (e.g. `type`, `maturity`) +- [`macro` properties](/reference/macro-properties) (e.g. `arguments`) ## Example Here's an example that defines both `sources` and `models` for a project: @@ -156,21 +152,21 @@ models: ## Related documentation You can find an exhaustive list of each supported property and config, broken down by resource type: -* Model [properties](model-properties) and [configs](model-configs) -* Source [properties](source-properties) and [configs](source-configs) -* Seed [properties](seed-properties) and [configs](seed-configs) +* Model [properties](/reference/model-properties) and [configs](/reference/model-configs) +* Source [properties](/reference/source-properties) and [configs](source-configs) +* Seed [properties](/reference/seed-properties) and [configs](/reference/seed-configs) * [Snapshot Properties](snapshot-properties) * Analysis [properties](analysis-properties) -* [Macro Properties](macro-properties) -* Exposure [properties](exposure-properties) +* [Macro Properties](/reference/macro-properties) +* Exposure [properties](/reference/exposure-properties) ## FAQs - - - - - - + + + + + + ## Troubleshooting common errors @@ -205,4 +201,4 @@ Runtime Error ``` -This error occurred because a semicolon (`;`) was accidentally used instead of a colon (`:`) after the `description` field. To resolve issues like this, find the `.yml` file referenced in the error message and fix any syntax errors present in the file. There are online yaml validators that can be helpful here, but please be mindful of submitting sensitive information to third-party applications! +This error occurred because a semicolon (`;`) was accidentally used instead of a colon (`:`) after the `description` field. 
To resolve issues like this, find the `.yml` file referenced in the error message and fix any syntax errors present in the file. There are online YAML validators that can be helpful here, but please be mindful of submitting sensitive information to third-party applications! diff --git a/website/docs/reference/dbt-classes.md b/website/docs/reference/dbt-classes.md index 20e7637ba7a..13f9263e545 100644 --- a/website/docs/reference/dbt-classes.md +++ b/website/docs/reference/dbt-classes.md @@ -8,9 +8,10 @@ These classes are often useful when building advanced dbt models and macros. ## Relation -The `Relation` object is used to interpolate schema and names into SQL code with appropriate quoting. This object should _always_ be used instead of interpolating values with `{{ schema }}.{{ table }}` directly. Quoting of the Relation object can be configured using the [`quoting` config][quoting]. +The `Relation` object is used to interpolate schema and names into SQL code with appropriate quoting. This object should _always_ be used instead of interpolating values with `{{ schema }}.{{ table }}` directly. Quoting of the Relation object can be configured using the [`quoting` config](/reference/project-configs/quoting). -### Creating Relations + +### Creating relations A `Relation` can be created by calling the `create` class method on the `Relation` class. @@ -30,9 +31,10 @@ class Relation: -### Using Relations +### Using relations - +In addition to `api.Relation.create`, dbt returns a Relation when you use [`ref`](/reference/dbt-jinja-functions/ref), [`source`](/reference/dbt-jinja-functions/source) or [`this`](/reference/dbt-jinja-functions/this). 
+ ```jinja2 {% set relation = api.Relation.create(schema='snowplow', identifier='events') %} @@ -52,7 +54,7 @@ class Relation: -- Return true if the relation is a table {{ relation.is_table }} --- Return true if the relation is a +-- Return true if the relation is a view {{ relation.is_view }} -- Return true if the relation is a cte @@ -79,13 +81,15 @@ class Column(object): """ -# Example Usage: +# Example usage: col = Column('name', 'varchar', 255) col.is_string() # True col.is_numeric() # False col.is_number() # False +col.is_integer() # False col.is_float() # False -col.string_type() # character varying (255) +col.string_type() # character varying(255) +col.numeric_type('numeric', 12, 4) # numeric(12,4) ``` @@ -100,23 +104,70 @@ col.string_type() # character varying (255) ### Instance methods - - - The `is_number` and `is_float` instance methods were added dbt v0.16.0 - - - - **is_string()**: Returns True if the column is a String type (eg. text, varchar), else False - **is_numeric()**: Returns True if the column is a fixed-precision Numeric type (eg. `numeric`), else False - **is_number()**: Returns True if the column is a number-y type (eg. `numeric`, `int`, `float`, or similar), else False +- **is_integer()**: Returns True if the column is an integer (eg. `int`, `bigint`, `serial` or similar), else False - **is_float()**: Returns True if the column is a float type (eg. `float`, `float64`, or similar), else False - **string_size()**: Returns the width of the column if it is a string type, else, an exception is raised ### Static methods - **string_type(size)**: Returns a database-useable representation of the string type (eg. `character varying(255)`) -- **numeric_type(dtype, size)**: Returns a database-useable representation of the numeric type (eg. `numeric(12, 4)`) +- **numeric_type(dtype, precision, scale)**: Returns a database-useable representation of the numeric type (eg. 
`numeric(12, 4)`) + +### Using columns + + + +```jinja2 +-- String column +{%- set string_column = api.Column('name', 'varchar', char_size=255) %} + +-- Return true if the column is a string +{{ string_column.is_string() }} + +-- Return true if the column is a numeric +{{ string_column.is_numeric() }} + +-- Return true if the column is a number +{{ string_column.is_number() }} + +-- Return true if the column is an integer +{{ string_column.is_integer() }} + +-- Return true if the column is a float +{{ string_column.is_float() }} + +-- Numeric column +{%- set numeric_column = api.Column('distance_traveled', 'numeric', numeric_precision=12, numeric_scale=4) %} + +-- Return true if the column is a string +{{ numeric_column.is_string() }} -## BigQuery Columns +-- Return true if the column is a numeric +{{ numeric_column.is_numeric() }} + +-- Return true if the column is a number +{{ numeric_column.is_number() }} + +-- Return true if the column is an integer +{{ numeric_column.is_integer() }} + +-- Return true if the column is a float +{{ numeric_column.is_float() }} + +-- Static methods + +-- Return the string data type for this database adapter with a given size +{{ api.Column.string_type(255) }} + +-- Return the numeric data type for this database adapter with a given precision and scale +{{ api.Column.numeric_type('numeric', 12, 4) }} +``` + + + +## BigQuery columns The `Column` type is overridden as a `BigQueryColumn` in BigQuery dbt projects. This object works the same as the `Column` type described above, with the exception of extra properties and methods: ### Properties @@ -137,18 +188,15 @@ will be expanded to: ## Result objects - - -* `v0.19.0`: The `Result` object significantly changed its schema. See https://schemas.getdbt.com/dbt/run-results/v1.json for the full specification. - - - -The execution of a resource in dbt generates a `Result` object. This object contains information about the executed node, timing, status, and metadata returned by the adapter. 
At the end of an invocation, dbt records these objects in [`run_results.json`](run-results-json). +The execution of a resource in dbt generates a `Result` object. This object contains information about the executed node, timing, status, and metadata returned by the adapter. At the end of an invocation, dbt records these objects in [`run_results.json`](/reference/artifacts/run-results-json). - `node`: Full object representation of the dbt resource (model, seed, snapshot, test) executed, including its `unique_id` - `status`: dbt's interpretation of runtime success, failure, or error - `thread_id`: Which thread executed this node? E.g. `Thread-1` -- `execution_time`: Total time spent executing this node +- `execution_time`: Total time spent executing this node, measured in seconds. - `timing`: Array that breaks down execution time into steps (often `compile` + `execute`) -- `adapter_response`: Dictionary of metadata returned from the database, which varies by adapter. E.g. success `code`, number of `rows_affected`, total `bytes_processed`, etc. - `message`: How dbt will report this result on the CLI, based on information returned from the database + +import RowsAffected from '/snippets/_run-result.md'; + + diff --git a/website/docs/reference/dbt-commands.md b/website/docs/reference/dbt-commands.md index 37f3d234837..1448d9849d3 100644 --- a/website/docs/reference/dbt-commands.md +++ b/website/docs/reference/dbt-commands.md @@ -2,30 +2,97 @@ title: "dbt Command reference" --- -dbt is typically run one of two ways: -* In [dbt Cloud](/docs/get-started/develop-in-the-cloud) -* On the command line - -The following sections outline the commands supported by dbt and their relevant flags. Note that some commands are only supported when using the CLI. - -For information about selecting models on the command line, consult the docs on [Model selection syntax](node-selection/syntax). 
- -**Available commands:** - -- [build](build): build and test all selected resources (models, seeds, snapshots, tests) -- [clean](clean) (CLI only): deletes artifacts present in the dbt project -- [compile](compile): compiles (but does not run) the models in a project -- [debug](debug) (CLI only): debugs dbt connections and projects -- [deps](deps): downloads dependencies for a project -- [docs](cmd-docs) : generates documentation for a project -- [init](init) (CLI only): initializes a new dbt project -- [list](list) (CLI only): lists resources defined in a dbt project -- [parse](parse) (CLI only): parses a project and writes detailed timing info -- [run](run): runs the models in a project -- [seed](seed): loads CSV files into the database -- [snapshot](snapshot): executes "snapshot" jobs defined in a project -- [source](commands/source): provides tools for working with source data (including validating that sources are "fresh") -- [test](commands/test): executes tests defined in a project -- [rpc](rpc) (CLI only): runs an RPC server that clients can submit queries to -- [run-operation](run-operation): runs arbitrary maintenance SQL against the database +You can run dbt using the following tools: +- In your browser with the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) +- On the command line interface using the [dbt Cloud CLI](/docs/cloud/cloud-cli-installation) or open-source [dbt Core](/docs/core/about-dbt-core), both of which enable you to execute dbt commands. The key distinction is the dbt Cloud CLI is tailored for dbt Cloud's infrastructure and integrates with all its [features](/docs/cloud/about-cloud/dbt-cloud-features). + +The following sections outline the commands supported by dbt and their relevant flags. For information about selecting models on the command line, consult the docs on [Model selection syntax](/reference/node-selection/syntax). 
+ +### Available commands + + + +All commands in the table are compatible with either the dbt Cloud IDE, dbt Cloud CLI, or dbt Core. + +You can run dbt commands in your specific tool by prefixing them with `dbt`. For example, to run the `test` command, type `dbt test`. + +| Command | Description | Compatible tools | Version | +| ------- | ----------- | ---------------- | ------- | +| [build](/reference/commands/build) | Build and test all selected resources (models, seeds, snapshots, tests) | All | All [supported versions](/docs/dbt-versions/core) | +| cancel | Cancels the most recent invocation.| dbt Cloud CLI | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [clean](/reference/commands/clean) | Deletes artifacts present in the dbt project | All | All [supported versions](/docs/dbt-versions/core) | +| [clone](/reference/commands/clone) | Clone selected models from the specified state | dbt Cloud CLI
          dbt Core | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [compile](/reference/commands/compile) | Compiles (but does not run) the models in a project | All | All [supported versions](/docs/dbt-versions/core) | +| [debug](/reference/commands/debug) | Debugs dbt connections and projects | dbt Core | All [supported versions](/docs/dbt-versions/core) | +| [deps](/reference/commands/deps) | Downloads dependencies for a project | All | All [supported versions](/docs/dbt-versions/core) | +| [docs](/reference/commands/cmd-docs) | Generates documentation for a project | All | All [supported versions](/docs/dbt-versions/core) | +| help | Displays help information for any command | dbt Core
          dbt Cloud CLI | All [supported versions](/docs/dbt-versions/core) | +| [list](/reference/commands/list) | Lists resources defined in a dbt project | All | All [supported versions](/docs/dbt-versions/core) | +| [parse](/reference/commands/parse) | Parses a project and writes detailed timing info | All | All [supported versions](/docs/dbt-versions/core) | +| reattach | Reattaches to the most recent invocation to retrieve logs and artifacts. | dbt Cloud CLI | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [retry](/reference/commands/retry) | Retry the last run `dbt` command from the point of failure | All | Requires [dbt v1.6 or higher](/docs/dbt-versions/core) | +| [run](/reference/commands/run) | Runs the models in a project | All | All [supported versions](/docs/dbt-versions/core) | +| [run-operation](/reference/commands/run-operation) | Invoke a macro, including running arbitrary maintenance SQL against the database | All | All [supported versions](/docs/dbt-versions/core) | +| [seed](/reference/commands/seed) | Loads CSV files into the database | All | All [supported versions](/docs/dbt-versions/core) | +| [show](/reference/commands/show) | Preview table rows post-transformation | All | All [supported versions](/docs/dbt-versions/core) | +| [snapshot](/reference/commands/snapshot) | Executes "snapshot" jobs defined in a project | All | All [supported versions](/docs/dbt-versions/core) | +| [source](/reference/commands/source) | Provides tools for working with source data (including validating that sources are "fresh") | All | All [supported versions](/docs/dbt-versions/core) | +| [test](/reference/commands/test) | Executes tests defined in a project | All | All [supported versions](/docs/dbt-versions/core) | +| [init](/reference/commands/init) | Initializes a new dbt project | dbt Core | All [supported versions](/docs/dbt-versions/core) | + +
          + + + +Select the tabs that are relevant to your development workflow. For example, if you develop in the dbt Cloud IDE, select **dbt Cloud**. + + + + +Use the following dbt commands in the [dbt Cloud IDE](/docs/cloud/dbt-cloud-ide/develop-in-the-cloud) and use the `dbt` prefix. For example, to run the `test` command, type `dbt test`. + +- [build](/reference/commands/build): build and test all selected resources (models, seeds, snapshots, tests) +- [clone](/reference/commands/clone): clone selected nodes from the specified state (requires dbt 1.6 or higher) +- [compile](/reference/commands/compile): compiles (but does not run) the models in a project +- [deps](/reference/commands/deps): downloads dependencies for a project +- [docs](/reference/commands/cmd-docs) : generates documentation for a project +- [retry](/reference/commands/retry): retry the last run `dbt` command from the point of failure (requires dbt 1.6 or later) +- [run](/reference/commands/run): runs the models in a project +- [run-operation](/reference/commands/run-operation): invoke a macro, including running arbitrary maintenance SQL against the database +- [seed](/reference/commands/seed): loads CSV files into the database +- [show](/reference/commands/show): preview table rows post-transformation +- [snapshot](/reference/commands/snapshot): executes "snapshot" jobs defined in a project +- [source](/reference/commands/source): provides tools for working with source data (including validating that sources are "fresh") +- [test](/reference/commands/test): executes tests defined in a project + + + + + +Use the following dbt commands in [dbt Core](/docs/core/about-dbt-core) and use the `dbt` prefix. For example, to run the `test` command, type `dbt test`. 
+ +- [build](/reference/commands/build): build and test all selected resources (models, seeds, snapshots, tests) +- [clean](/reference/commands/clean): deletes artifacts present in the dbt project +- [clone](/reference/commands/clone): clone selected models from the specified state (requires dbt 1.6 or higher) +- [compile](/reference/commands/compile): compiles (but does not run) the models in a project +- [debug](/reference/commands/debug): debugs dbt connections and projects +- [deps](/reference/commands/deps): downloads dependencies for a project +- [docs](/reference/commands/cmd-docs) : generates documentation for a project +- [init](/reference/commands/init): initializes a new dbt project +- [list](/reference/commands/list): lists resources defined in a dbt project +- [parse](/reference/commands/parse): parses a project and writes detailed timing info +- [retry](/reference/commands/retry): retry the last run `dbt` command from the point of failure (requires dbt 1.6 or higher) +- [rpc](/reference/commands/rpc): runs an RPC server that clients can submit queries to +- [run](/reference/commands/run): runs the models in a project +- [run-operation](/reference/commands/run-operation): invoke a macro, including running arbitrary maintenance SQL against the database +- [seed](/reference/commands/seed): loads CSV files into the database +- [show](/reference/commands/show): preview table rows post-transformation +- [snapshot](/reference/commands/snapshot): executes "snapshot" jobs defined in a project +- [source](/reference/commands/source): provides tools for working with source data (including validating that sources are "fresh") +- [test](/reference/commands/test): executes tests defined in a project + + + + + diff --git a/website/docs/reference/dbt-jinja-functions/adapter.md b/website/docs/reference/dbt-jinja-functions/adapter.md index 1c11d1c8ff7..fdb630e8717 100644 --- a/website/docs/reference/dbt-jinja-functions/adapter.md +++ 
b/website/docs/reference/dbt-jinja-functions/adapter.md @@ -1,15 +1,19 @@ --- -title: "adapter" +title: "About adapter object" +sidebar_label: "adapter" id: "adapter" +description: "Wrap the internal database adapter with the Jinja object `adapter`." --- -## Overview +Your database communicates with dbt using an internal database adapter object. For example, BaseAdapter and SnowflakeAdapter. The Jinja object `adapter` is a wrapper around this internal database adapter object. -`adapter` is a wrapper around the internal database adapter used by dbt. It allows users to make calls to the database in their dbt models. The adapter methods below will be translated into specific SQL statements depending on the type of adapter your project is using. +`adapter` grants the ability to invoke adapter methods of that internal class via: +* `{% do adapter. %}` -- invoke internal adapter method +* `{{ adapter. }}` -- invoke internal adapter method and capture its return value for use in materialization or other macros -The following functions are available: +For example, the adapter methods below will be translated into specific SQL statements depending on the type of adapter your project is using: -- [adapter.dispatch](dispatch) +- [adapter.dispatch](/reference/dbt-jinja-functions/dispatch) - [adapter.get_missing_columns](#get_missing_columns) - [adapter.expand_target_column_types](#expand_target_column_types) - [adapter.get_relation](#get_relation) or [load_relation](#load_relation) @@ -29,15 +33,15 @@ The following adapter functions are deprecated, and will be removed in a future ## dispatch -Moved to separate page: [dispatch](dispatch) +Moved to separate page: [dispatch](/reference/dbt-jinja-functions/dispatch) ## get_missing_columns __Args__: - * `from_relation`: The source [Relation](dbt-classes#relation) - * `to_relation`: The target [Relation](dbt-classes#relation) + * `from_relation`: The source [Relation](/reference/dbt-classes#relation) + * `to_relation`: The target 
[Relation](/reference/dbt-classes#relation) -Returns a list of [Columns](dbt-classes#column) that is the difference of the columns in the `from_table` +Returns a list of [Columns](/reference/dbt-classes#column) that is the difference of the columns in the `from_table` and the columns in the `to_table`, i.e. (`set(from_relation.columns) - set(to_table.columns)`). Useful for detecting new columns in a source . @@ -62,8 +66,8 @@ Useful for detecting new columns in a source . ## expand_target_column_types __Args__: - * `from_relation`: The source [Relation](dbt-classes#relation) to use as a template - * `to_relation`: The [Relation](dbt-classes#relation) to mutate + * `from_relation`: The source [Relation](/reference/dbt-classes#relation) to use as a template + * `to_relation`: The [Relation](/reference/dbt-classes#relation) to mutate Expand the `to_relation` 's column types to match the schema of `from_relation`. Column expansion is constrained to string and numeric types on supported databases. Typical usage involves expanding column types (from eg. `varchar(16)` to `varchar(32)`) to support insert statements. @@ -88,7 +92,7 @@ __Args__: * `schema`: The schema of the relation to fetch * `identifier`: The identifier of the relation to fetch -Returns a cached [Relation](dbt-classes#relation) object identified by the `database.schema.identifier` provided to the method, or `None` if the relation does not exist. +Returns a cached [Relation](/reference/dbt-classes#relation) object identified by the `database.schema.identifier` provided to the method, or `None` if the relation does not exist. **Usage**: @@ -110,9 +114,9 @@ Returns a cached [Relation](dbt-classes#relation) object identified by the `data ## load_relation __Args__: - * `relation`: The [Relation](dbt-classes#relation) to try to load + * `relation`: The [Relation](/reference/dbt-classes#relation) to try to load -A convenience wrapper for [get_relation](#get_relation). 
Returns the cached version of the [Relation](dbt-classes#relation) object, or `None` if the relation does not exist. +A convenience wrapper for [get_relation](#get_relation). Returns the cached version of the [Relation](/reference/dbt-classes#relation) object, or `None` if the relation does not exist. **Usage**: @@ -120,7 +124,7 @@ A convenience wrapper for [get_relation](#get_relation). Returns the cached vers ```sql -{% set relation_exists = (load_relation(ref('my_model')) is not none %} +{% set relation_exists = load_relation(ref('my_model')) is not none %} {% if relation_exists %} {{ log("my_model has already been built", info=true) }} {% else %} @@ -135,9 +139,9 @@ A convenience wrapper for [get_relation](#get_relation). Returns the cached vers ## get_columns_in_relation __Args__: - * `relation`: The [Relation](dbt-classes#relation) to find the columns for + * `relation`: The [Relation](/reference/dbt-classes#relation) to find the columns for -Returns a list of [Columns](dbt-classes#column) in a . +Returns a list of [Columns](/reference/dbt-classes#column) in a . **Usage**: @@ -273,7 +277,7 @@ __Args__: * `schema_name`: The schema to test * `table_name`: The (or view) from which to select columns -Returns a list of [Columns](dbt-classes#column) in a . +Returns a list of [Columns](/reference/dbt-classes#column) in a . diff --git a/website/docs/reference/dbt-jinja-functions/as_bool.md b/website/docs/reference/dbt-jinja-functions/as_bool.md index 8e0430b8bfe..d4c2bbf1743 100644 --- a/website/docs/reference/dbt-jinja-functions/as_bool.md +++ b/website/docs/reference/dbt-jinja-functions/as_bool.md @@ -1,6 +1,8 @@ --- -title: "as_bool" +title: "About as_bool filter" +sidebar_label: "as_bool" id: "as_bool" +description: "Use this filter to coerce a Jinja output into boolean value." --- The `as_bool` Jinja filter will coerce Jinja-compiled output into a boolean @@ -22,10 +24,3 @@ models: ``` - - - -* `v0.17.1`: Native rendering is disabled by default. 
The `as_bool` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_native.md b/website/docs/reference/dbt-jinja-functions/as_native.md index 1ff5980e69c..1de9ad45bf9 100644 --- a/website/docs/reference/dbt-jinja-functions/as_native.md +++ b/website/docs/reference/dbt-jinja-functions/as_native.md @@ -1,23 +1,18 @@ --- -title: "as_native" +title: "About as_native filter" +sidebar_label: "as_native" id: "as_native" +description: "Use this filter to coerce Jinja-compiled output into its native Python representation." --- The `as_native` Jinja filter will coerce Jinja-compiled output into its Python native representation according to [`ast.literal_eval`](https://docs.python.org/3/library/ast.html#ast.literal_eval). The result can be any Python native type (set, list, tuple, dict, etc). -To render boolean and numeric values, it is recommended to use [`as_bool`](as_bool) -and [`as_number`](as_number) instead. +To render boolean and numeric values, it is recommended to use [`as_bool`](/reference/dbt-jinja-functions/as_bool) +and [`as_number`](/reference/dbt-jinja-functions/as_number) instead. :::danger Proceed with caution Unlike `as_bool` and `as_number`, `as_native` will return a rendered value regardless of the input type. Ensure that your inputs match expectations. ::: - - - -* `v0.17.1`: Native rendering is disabled by default. The `as_native` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_number.md b/website/docs/reference/dbt-jinja-functions/as_number.md index eb74b94e843..29b35094880 100644 --- a/website/docs/reference/dbt-jinja-functions/as_number.md +++ b/website/docs/reference/dbt-jinja-functions/as_number.md @@ -1,6 +1,8 @@ --- -title: "as_number" +title: "About as_number filter" +sidebar_label: "as_number" id: "as_number" +description: "Use this filter to convert Jinja-compiled output to a numeric value." 
--- The `as_number` Jinja filter will coerce Jinja-compiled output into a numeric @@ -23,10 +25,3 @@ my_profile: ```
          - - - -* `v0.17.1`: Native rendering is disabled by default. The `as_number` filter was -introduced. - - diff --git a/website/docs/reference/dbt-jinja-functions/as_text.md b/website/docs/reference/dbt-jinja-functions/as_text.md index 4b12789dbd5..6b26cfa327d 100644 --- a/website/docs/reference/dbt-jinja-functions/as_text.md +++ b/website/docs/reference/dbt-jinja-functions/as_text.md @@ -1,18 +1,20 @@ --- -title: "as_text" +title: "About as_text filter" +sidebar_label: "as_text" id: "as_text" +description: "Use this filter to convert Jinja-compiled output back to text." --- The `as_text` Jinja filter will coerce Jinja-compiled output back to text. It -can be used in yaml rendering contexts where values _must_ be provided as +can be used in YAML rendering contexts where values _must_ be provided as strings, rather than as the datatype that they look like. :::info Heads up In dbt v0.17.1, native rendering is not enabled by default. As such, the `as_text` filter has no functional effect. -It is still possible to natively render specific values using the [`as_bool`](as_bool), -[`as_number`](as_number), and [`as_native`](as_native) filters. +It is still possible to natively render specific values using the [`as_bool`](/reference/dbt-jinja-functions/as_bool), +[`as_number`](/reference/dbt-jinja-functions/as_number), and [`as_native`](/reference/dbt-jinja-functions/as_native) filters. ::: @@ -54,12 +56,3 @@ models: ```
          - - - -* `v0.17.0`: Native rendering is enabled by default. The `as_text` filter was -introduced. -* `v0.17.1`: Native rendering is disabled by default. The `as_text` filter works -as before, with no functional effect. - - diff --git a/website/docs/reference/dbt-jinja-functions/builtins.md b/website/docs/reference/dbt-jinja-functions/builtins.md index f3ee4313f2c..edc5f34ffda 100644 --- a/website/docs/reference/dbt-jinja-functions/builtins.md +++ b/website/docs/reference/dbt-jinja-functions/builtins.md @@ -1,23 +1,66 @@ --- -title: "builtins" +title: "About builtins Jinja variable" +sidebar_label: "builtins" id: "builtins" +description: "Read this guide to understand the builtins Jinja variable in dbt." --- -New in 0.16.0 The `builtins` variable exists to provide references to builtin dbt context methods. This allows macros to be created with names that _mask_ dbt builtin context methods, while still making those methods accessible in the dbt compilation context. The `builtins` variable is a dictionary containing the following keys: -- [ref](ref) +- [ref](/reference/dbt-jinja-functions/ref) - [source](/reference/dbt-jinja-functions/source) - [config](/reference/dbt-jinja-functions/config) ## Usage -The following macro overrides the `ref` method available in the model compilation context to return a [Relation](dbt-classes#relation) with the database name overriden to `dev`. +:::important + +Using the `builtins` variable in this way is an advanced development workflow. Users should be ready to maintain and update these overrides when upgrading in the future. +::: + + + +From dbt v1.5 and higher, use the following macro to extract user-provided arguments, including version, and call the builtins.ref() function with either a single modelname argument or both packagename and modelname arguments, based on the number of positional arguments in varargs: + +

          + + +``` +{% macro ref() %} +-- extract user-provided positional and keyword arguments + {% set version = kwargs.get('version') %} + {% set packagename = none %} + {%- if (varargs | length) == 1 -%} + {% set modelname = varargs[0] %} +{%- else -%} + {% set packagename = varargs[0] %} + {% set modelname = varargs[1] %} +{% endif %} +-- call builtins.ref based on provided positional arguments +{% set rel = None %} +{% if packagename is not none %} + {% set rel = builtins.ref(packagename, modelname, version=version) %} +{% else %} + {% set rel = builtins.ref(modelname, version=version) %} +{% endif %} + +-- finally, override the database name with "dev" +{% set newrel = rel.replace_path(database="dev") %} +{% do return(newrel) %} + +{% endmacro %} +``` +
          + + + +From dbt v1.4 and lower, use the following macro to override the `ref` method available in the model compilation context to return a [Relation](/reference/dbt-classes#relation) with the database name overridden to `dev`: ``` + {% macro ref(model_name) %} {% set rel = builtins.ref(model_name) %} @@ -26,6 +69,7 @@ The following macro overrides the `ref` method available in the model compilatio {% endmacro %} ``` + The ref macro can also be used to control which elements of the model path are rendered when run, for example the following macro overrides the `ref` method to render only the schema and object identifier, but not the database reference i.e. `my_schema.my_model` rather than `my_database.my_schema.my_model`. This is especially useful when using snowflake as a warehouse, if you intend to change the name of the database post-build and wish the references to remain accurate. diff --git a/website/docs/reference/dbt-jinja-functions/config.md b/website/docs/reference/dbt-jinja-functions/config.md index 616d8cd6d9c..c2fc8f96e5b 100644 --- a/website/docs/reference/dbt-jinja-functions/config.md +++ b/website/docs/reference/dbt-jinja-functions/config.md @@ -1,10 +1,10 @@ --- -title: "config" +title: "About config variable" +sidebar_label: "config" id: "config" +description: "Read this guide to understand the config Jinja function in dbt." --- -## Overview - The `config` variable exists to handle end-user configuration for custom materializations. Configs like `unique_key` can be implemented using the `config` variable in your own materializations. 
For example, code in the `incremental` materialization like this: diff --git a/website/docs/reference/dbt-jinja-functions/cross-database-macros.md b/website/docs/reference/dbt-jinja-functions/cross-database-macros.md index a7d8f2bc54b..4df8275d4bd 100644 --- a/website/docs/reference/dbt-jinja-functions/cross-database-macros.md +++ b/website/docs/reference/dbt-jinja-functions/cross-database-macros.md @@ -1,12 +1,12 @@ --- -title: "cross-database macros" +title: "About cross-database macros" +sidebar_label: "cross-database macros" id: "cross-database-macros" +description: "Read this guide to understand cross-database macros in dbt." --- # Cross-database macros -## Overview - These macros benefit three different user groups: - If you maintain a package, your package is more likely to work on other adapters by using these macros (rather than a specific database's SQL syntax) - If you maintain an adapter, your adapter is more likely to support more packages by implementing (and testing) these macros. 
@@ -20,66 +20,94 @@ Please make sure to take a look at the [SQL expressions section](#sql-expression -- [any_value](#any_value) -- [bool_or](#bool_or) -- [cast_bool_to_text](#cast_bool_to_text) -- [concat](#concat) -- [dateadd](#dateadd) -- [datediff](#datediff) -- [date_trunc](#date_trunc) -- [escape_single_quotes](#escape_single_quotes) -- [except](#except) -- [hash](#hash) -- [intersect](#intersect) -- [last_day](#last_day) -- [length](#length) -- [listagg](#listagg) -- [position](#position) -- [replace](#replace) -- [right](#right) -- [safe_cast](#safe_cast) -- [split_part](#split_part) -- [string_literal](#string_literal) -- [type_bigint](#type_bigint) -- [type_float](#type_float) -- [type_int](#type_int) -- [type_numeric](#type_numeric) -- [type_string](#type_string) -- [type_timestamp](#type_timestamp) +- [Cross-database macros](#cross-database-macros) + - [All functions (alphabetical)](#all-functions-alphabetical) + - [Data type functions](#data-type-functions) + - [type\_bigint](#type_bigint) + - [type\_boolean](#type_boolean) + - [type\_float](#type_float) + - [type\_int](#type_int) + - [type\_numeric](#type_numeric) + - [type\_string](#type_string) + - [type\_timestamp](#type_timestamp) + - [Set functions](#set-functions) + - [except](#except) + - [intersect](#intersect) + - [Array functions](#array-functions) + - [array\_append](#array_append) + - [array\_concat](#array_concat) + - [array\_construct](#array_construct) + - [String functions](#string-functions) + - [concat](#concat) + - [hash](#hash) + - [length](#length) + - [position](#position) + - [replace](#replace) + - [right](#right) + - [split\_part](#split_part) + - [String literal functions](#string-literal-functions) + - [escape\_single\_quotes](#escape_single_quotes) + - [string\_literal](#string_literal) + - [Aggregate and window functions](#aggregate-and-window-functions) + - [any\_value](#any_value) + - [bool\_or](#bool_or) + - [listagg](#listagg) + - [Cast functions](#cast-functions) + - 
[cast\_bool\_to\_text](#cast_bool_to_text) + - [safe\_cast](#safe_cast) + - [Date and time functions](#date-and-time-functions) + - [dateadd](#dateadd) + - [datediff](#datediff) + - [date\_trunc](#date_trunc) + - [last\_day](#last_day) + - [Date and time parts](#date-and-time-parts) + - [SQL expressions](#sql-expressions) -- [any_value](#any_value) -- [array_append](#array_append) -- [array_concat](#array_concat) -- [array_construct](#array_construct) -- [bool_or](#bool_or) -- [cast_bool_to_text](#cast_bool_to_text) -- [concat](#concat) -- [dateadd](#dateadd) -- [datediff](#datediff) -- [date_trunc](#date_trunc) -- [escape_single_quotes](#escape_single_quotes) -- [except](#except) -- [hash](#hash) -- [intersect](#intersect) -- [last_day](#last_day) -- [length](#length) -- [listagg](#listagg) -- [position](#position) -- [replace](#replace) -- [right](#right) -- [safe_cast](#safe_cast) -- [split_part](#split_part) -- [string_literal](#string_literal) -- [type_bigint](#type_bigint) -- [type_boolean](#type_boolean) -- [type_float](#type_float) -- [type_int](#type_int) -- [type_numeric](#type_numeric) -- [type_string](#type_string) -- [type_timestamp](#type_timestamp) +- [Cross-database macros](#cross-database-macros) + - [All functions (alphabetical)](#all-functions-alphabetical) + - [Data type functions](#data-type-functions) + - [type\_bigint](#type_bigint) + - [type\_boolean](#type_boolean) + - [type\_float](#type_float) + - [type\_int](#type_int) + - [type\_numeric](#type_numeric) + - [type\_string](#type_string) + - [type\_timestamp](#type_timestamp) + - [Set functions](#set-functions) + - [except](#except) + - [intersect](#intersect) + - [Array functions](#array-functions) + - [array\_append](#array_append) + - [array\_concat](#array_concat) + - [array\_construct](#array_construct) + - [String functions](#string-functions) + - [concat](#concat) + - [hash](#hash) + - [length](#length) + - [position](#position) + - [replace](#replace) + - [right](#right) + - 
[split\_part](#split_part) + - [String literal functions](#string-literal-functions) + - [escape\_single\_quotes](#escape_single_quotes) + - [string\_literal](#string_literal) + - [Aggregate and window functions](#aggregate-and-window-functions) + - [any\_value](#any_value) + - [bool\_or](#bool_or) + - [listagg](#listagg) + - [Cast functions](#cast-functions) + - [cast\_bool\_to\_text](#cast_bool_to_text) + - [safe\_cast](#safe_cast) + - [Date and time functions](#date-and-time-functions) + - [dateadd](#dateadd) + - [datediff](#datediff) + - [date\_trunc](#date_trunc) + - [last\_day](#last_day) + - [Date and time parts](#date-and-time-parts) + - [SQL expressions](#sql-expressions) diff --git a/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md b/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md index 71d0ce67628..0d377d29cef 100644 --- a/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md +++ b/website/docs/reference/dbt-jinja-functions/dbt-project-yml-context.md @@ -1,26 +1,22 @@ --- -title: "dbt_project.yml Context" +title: " About dbt_project.yml context" +sidebar_label: "dbt_project.yml context" id: "dbt-project-yml-context" +description: "The context methods and variables available when configuring resources in the dbt_project.yml file." --- - - -The compilation context of the `dbt_project.yml` file is well-defined as -of dbt v0.16.0 - - - -The following context variables and methods are available when configuring +The following context methods and variables are available when configuring resources in the `dbt_project.yml` file. This applies to the `models:`, `seeds:`, and `snapshots:` keys in the `dbt_project.yml` file. 
-**Available context variables:** -- [target](target) -- [env_var](env_var) -- [vars](var) (_Note: only variables defined with `--vars` are available_) -- [builtins](builtins) -- [dbt_version](dbt_version) +**Available context methods:** +- [env_var](/reference/dbt-jinja-functions/env_var) +- [var](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_) +**Available context variables:** +- [target](/reference/dbt-jinja-functions/target) +- [builtins](/reference/dbt-jinja-functions/builtins) +- [dbt_version](/reference/dbt-jinja-functions/dbt_version) ### Example configuration diff --git a/website/docs/reference/dbt-jinja-functions/dbt_version.md b/website/docs/reference/dbt-jinja-functions/dbt_version.md index 02c31cdd41a..5a96c9bd226 100644 --- a/website/docs/reference/dbt-jinja-functions/dbt_version.md +++ b/website/docs/reference/dbt-jinja-functions/dbt_version.md @@ -1,10 +1,10 @@ --- -title: "dbt_version" +title: "About dbt_version variable" +sidebar_label: "dbt_version" id: "dbt_version" +description: "Read this guide to understand the dbt_version Jinja function in dbt." --- -New in 0.16.0 - The `dbt_version` variable returns the installed version of dbt that is currently running. It can be used for debugging or auditing purposes. diff --git a/website/docs/reference/dbt-jinja-functions/debug-method.md b/website/docs/reference/dbt-jinja-functions/debug-method.md index b7bbb52f4ea..0938970b50c 100644 --- a/website/docs/reference/dbt-jinja-functions/debug-method.md +++ b/website/docs/reference/dbt-jinja-functions/debug-method.md @@ -1,6 +1,8 @@ --- -title: "debug" +title: "About debug macro" +sidebar_label: "debug" id: "debug-method" +description: "The `{{ debug() }}` macro will open an iPython debugger." --- @@ -10,8 +12,6 @@ The `debug` macro is new in dbt v0.14.1, and is only intended to be used in a de ::: -## Overview - The `{{ debug() }}` macro will open an iPython debugger in the context of a compiled dbt macro. 
The `DBT_MACRO_DEBUGGING` environment value must be set to use the debugger. ## Usage diff --git a/website/docs/reference/dbt-jinja-functions/dispatch.md b/website/docs/reference/dbt-jinja-functions/dispatch.md index 46f17029fef..5dff787219f 100644 --- a/website/docs/reference/dbt-jinja-functions/dispatch.md +++ b/website/docs/reference/dbt-jinja-functions/dispatch.md @@ -1,18 +1,11 @@ --- -title: "dispatch" +sidebar_label: "dispatch" +title: "About dispatch config" +id: "dispatch" +description: "dbt extends functionality across data platforms using multiple dispatch." --- - - -- **v0.18.0:** Introduced `dispatch` as a replacement for deprecated `adapter_macro` -- **v0.19.2:** Limited rendering context for `dispatch` arguments. Includes backwards compatibility for widely used packages. -- **v0.20.0:** Parent adapters' macro implementations are included in search order. Formalized supported arguments. -- **v0.21.0:** All dispatched macros in the dbt global project include `dbt` namespace -- **v1.0.0:** The 'packages' argument is fully deprecated. Use `macro_namespace` and project-level `dispatch` config instead. - - - -dbt can extend functionality across [Supported Data Platforms](supported-data-platforms) through a system of [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch). Because SQL syntax, data types, and / support vary across adapters, dbt can define and call generic functional macros, and then "dispatch" that macro to the appropriate implementation for the current adapter. +dbt can extend functionality across [Supported Data Platforms](/docs/supported-data-platforms) through a system of [multiple dispatch](https://en.wikipedia.org/wiki/Multiple_dispatch). Because SQL syntax, data types, and / support vary across adapters, dbt can define and call generic functional macros, and then "dispatch" that macro to the appropriate implementation for the current adapter. 
## Syntax @@ -43,7 +36,7 @@ dbt uses two criteria when searching for the right candidate macro: If dbt does not find an adapter-specific implementation, it will dispatch to the default implementation. -**Namespace:** Generally, dbt will search for implementations in the root project and internal projects (e.g. `dbt`, `dbt_postgres`). If the `macro_namespace` argument is provided, it instead searches the specified namespace (package) for viable implementations. It is also possible to dynamically route namespace searching by defining a [`dispatch` project config](project-configs/dispatch-config); see the examples below for details. +**Namespace:** Generally, dbt will search for implementations in the root project and internal projects (e.g. `dbt`, `dbt_postgres`). If the `macro_namespace` argument is provided, it instead searches the specified namespace (package) for viable implementations. It is also possible to dynamically route namespace searching by defining a [`dispatch` project config](/reference/project-configs/dispatch-config); see the examples below for details. ## Examples @@ -125,7 +118,7 @@ Following the second example above: Whenever I call my version of the `concat` m Why does this matter? Other macros in dbt-utils, such as `surrogate_key`, call the `dbt_utils.concat` macro directly. What if I want `dbt_utils.surrogate_key` to use _my_ version of `concat` instead, including my custom logic on Redshift? -As a user, I can accomplish this via a [project-level `dispatch` config](project-configs/dispatch-config). When dbt goes to dispatch `dbt_utils.concat`, it knows from the `macro_namespace` argument to search in the `dbt_utils` namespace. The config below defines dynamic routing for that namespace, telling dbt to search through an ordered sequence of packages, instead of just the `dbt_utils` package. +As a user, I can accomplish this via a [project-level `dispatch` config](/reference/project-configs/dispatch-config). 
When dbt goes to dispatch `dbt_utils.concat`, it knows from the `macro_namespace` argument to search in the `dbt_utils` namespace. The config below defines dynamic routing for that namespace, telling dbt to search through an ordered sequence of packages, instead of just the `dbt_utils` package. @@ -156,7 +149,7 @@ I maintain an internal utility package at my organization, named `my_org_dbt_hel My package can define custom versions of any dispatched global macro I choose, from `generate_schema_name` to `test_unique`. I can define a new default version of that macro (e.g. `default__generate_schema_name`), or custom versions for specific adapters (e.g. `spark__generate_schema_name`). -Each root project installing my package simply needs to include the [project-level `dispatch` config](project-configs/dispatch-config) that searches my package ahead of `dbt` for the `dbt` global namespace: +Each root project installing my package simply needs to include the [project-level `dispatch` config](/reference/project-configs/dispatch-config) that searches my package ahead of `dbt` for the `dbt` global namespace: @@ -168,6 +161,24 @@ dispatch: +### Managing different global overrides across packages + +You can override global behaviors in different ways for each project that is installed as a package. This holds true for all global macros: `generate_schema_name`, `create_table_as`, etc. When parsing or running a resource defined in a package, the definition of the global macro within that package takes precedence over the definition in the root project because it's more specific to those resources. + +By combining package-level overrides and `dispatch`, it is possible to achieve three different patterns: + +1. **Package always wins** — As the developer of dbt models in a project that will be deployed elsewhere as a package, you want full control over the macros used to define & materialize your models. 
Your macros should always take precedence for your models, and there should not be any way to override them. + + - _Mechanism:_ Each project/package fully overrides the macro by its name, for example, `generate_schema_name` or `create_table_as`. Do not use dispatch. + +2. **Conditional application (root project wins)** — As the maintainer of one dbt project in a mesh of multiple, your team wants conditional application of these rules. When running your project standalone (in development), you want to apply custom behavior; but when installed as a package and deployed alongside several other projects (in production), you want the root-level project's rules to apply. + + - _Mechanism:_ Each package implements its "local" override by registering a candidate for dispatch with an adapter prefix, for example, `default__generate_schema_name` or `default__create_table_as`. The root-level project can then register its own candidate for dispatch (`default__generate_schema_name`), winning the default search order or by explicitly overriding the macro by name (`generate_schema_name`). + +3. **Same rules everywhere all the time** — As a member of the data platform team responsible for consistency across teams at your organization, you want to create a "macro package" that every team can install & use. + + - _Mechanism:_ Create a standalone package of candidate macros only, for example, `default__generate_schema_name` or `default__create_table_as`. Add a [project-level `dispatch` configuration](/reference/project-configs/dispatch-config) in every project's `dbt_project.yml`. + ## For adapter plugin maintainers Most packages were initially designed to work on the four original dbt adapters. By using the `dispatch` macro and project config, it is possible to "shim" existing packages to work on other adapters, by way of third-party compatibility packages. 
@@ -213,7 +224,7 @@ As a `dbt-spark` user, by installing `dbt_utils` and `spark_utils` together, I d ### Adapter inheritance -Some adapters "inherit" from other adapters (e.g. `dbt-postgres` → `dbt-redshift`). If using a child adapter, dbt will include any parent adapter implementations in its search order, too. Instead of just looking for `redshift__` and falling back to `default__`, dbt will look for `redshift__`, `postgres__`, and `default__`, in that order. +Some adapters "inherit" from other adapters (e.g. `dbt-postgres` → `dbt-redshift`, and `dbt-spark` → `dbt-databricks`). If using a child adapter, dbt will include any parent adapter implementations in its search order, too. Instead of just looking for `redshift__` and falling back to `default__`, dbt will look for `redshift__`, `postgres__`, and `default__`, in that order. Child adapters tend to have very similar SQL syntax to their parents, so this allows them to skip reimplementing a macro that has already been reimplemented by the parent adapter. @@ -249,4 +260,4 @@ In rare cases, the child adapter may prefer the default implementation to its pa ## FAQs - + diff --git a/website/docs/reference/dbt-jinja-functions/doc.md b/website/docs/reference/dbt-jinja-functions/doc.md index fc4df8e18be..51ca6ad2059 100644 --- a/website/docs/reference/dbt-jinja-functions/doc.md +++ b/website/docs/reference/dbt-jinja-functions/doc.md @@ -1,9 +1,11 @@ --- -title: "doc" +title: "About doc function" +sidebar_label: "doc" id: "doc" +description: "Use the `doc` function to reference docs blocks in description fields." --- -The `doc` function is used to reference docs blocks in the description field of schema.yml files. It is analogous to the `ref` function. For more information, consult the [Documentation guide](documentation). +The `doc` function is used to reference docs blocks in the description field of schema.yml files. It is analogous to the `ref` function. 
For more information, consult the [Documentation guide](/docs/collaborate/documentation). Usage: diff --git a/website/docs/reference/dbt-jinja-functions/env_var.md b/website/docs/reference/dbt-jinja-functions/env_var.md index b956ece728a..f4cc05cec0f 100644 --- a/website/docs/reference/dbt-jinja-functions/env_var.md +++ b/website/docs/reference/dbt-jinja-functions/env_var.md @@ -1,6 +1,8 @@ --- -title: "env_var" +title: "About env_var function" +sidebar_label: "env_var" id: "env_var" +description: "Incorporate environment variables using the `env_var` function." --- The `env_var` function can be used to incorporate Environment Variables from the system into your dbt project. This `env_var` function can be used in your `profiles.yml` file, the `dbt_project.yml` file, the `sources.yml` file, your `schema.yml` files, and in model `.sql` files. Essentially `env_var` is available anywhere dbt processes jinja code. @@ -34,7 +36,7 @@ If passing an environment variable for a property that uses an integer type (for :::caution Quoting, Curly Brackets, & You -Be sure to quote the entire jinja string (as shown above), or else the yaml parser will be confused by the Jinja curly brackets. +Be sure to quote the entire jinja string (as shown above), or else the YAML parser will be confused by the Jinja curly brackets. ::: @@ -56,13 +58,6 @@ models: ### Secrets - - - - **v0.21.0:** Introduced `DBT_ENV_SECRET_` and log scrubbing - - **v1.0.0:** Restricted use of secret env vars to `profiles.yml` and `packages.yml` - - - For certain configurations, you can use "secret" env vars. 
Any env var named with the prefix `DBT_ENV_SECRET_` will be: - Available for use in `profiles.yml` + `packages.yml`, via the same `env_var()` function - Disallowed everywhere else, including `dbt_project.yml` and model SQL, to prevent accidentally writing these secret values to the data warehouse or metadata artifacts @@ -70,17 +65,20 @@ For certain configurations, you can use "secret" env vars. 
Any env var named wit The primary use case of secret env vars is git access tokens for [private packages](/docs/build/packages#private-packages). -**Note:** When dbt is loading profile credentials and package configuration, secret env vars will be replaced with the string value of the environment variable. You cannot modify secrets using Jinja filters, including type-casting filters such as [`as_number`](as_number) or [`as_bool`](as_bool), or pass them as arguments into other Jinja macros. - -### Custom metadata - - +**Note:** When dbt is loading profile credentials and package configuration, secret env vars will be replaced with the string value of the environment variable. You cannot modify secrets using Jinja filters, including type-casting filters such as [`as_number`](/reference/dbt-jinja-functions/as_number) or [`as_bool`](/reference/dbt-jinja-functions/as_bool), or pass them as arguments into other Jinja macros. You can only use _one secret_ per configuration: +```yml +# works +host: "{{ env_var('DBT_ENV_SECRET_HOST') }}" - - **v0.19.0:** Introduced `DBT_ENV_CUSTOM_ENV_` prefix and artifact `metadata.env` +# does not work +host: "www.{{ env_var('DBT_ENV_SECRET_HOST_DOMAIN') }}.com/{{ env_var('DBT_ENV_SECRET_HOST_PATH') }}" +``` - +### Custom metadata -Any env var named with the prefix `DBT_ENV_CUSTOM_ENV_` will be included in [dbt artifacts](dbt-artifacts#common-metadata), in a `metadata.env` dictionary, with its prefix-stripped name as its key. 
+Any env var named with the prefix `DBT_ENV_CUSTOM_ENV_` will be included in two places, with its prefix-stripped name as the key: +- [dbt artifacts](/reference/artifacts/dbt-artifacts#common-metadata): `metadata` -> `env` +- [events and structured logs](/reference/events-logging#info-fields): `info` -> `extra` diff --git a/website/docs/reference/dbt-jinja-functions/exceptions.md b/website/docs/reference/dbt-jinja-functions/exceptions.md index 77cfacb8309..3242a1cd075 100644 --- a/website/docs/reference/dbt-jinja-functions/exceptions.md +++ b/website/docs/reference/dbt-jinja-functions/exceptions.md @@ -1,6 +1,8 @@ --- -title: "exceptions" +title: "About exceptions namespace" +sidebar_label: "exceptions" id: "exceptions" +description: "Raise warnings/errors with the `exceptions` namespace." --- The `exceptions` namespace can be used to raise warnings and errors in dbt userspace. @@ -23,7 +25,7 @@ __Example usage__: ## warn -The `exceptions.warn` method will raise a compiler warning with the provided message. If the `--warn-error` flag is provided to dbt, then this warning will be elevated to an exception, which is raised. +The `exceptions.warn` method will raise a compiler warning with the provided message, but any model will still be successful and be treated as a PASS. If the `--warn-error` flag is provided to dbt, then this warning will be elevated to an exception, which is raised. __Example usage__: diff --git a/website/docs/reference/dbt-jinja-functions/execute.md b/website/docs/reference/dbt-jinja-functions/execute.md index e3febefad3a..05dd26a8ecf 100644 --- a/website/docs/reference/dbt-jinja-functions/execute.md +++ b/website/docs/reference/dbt-jinja-functions/execute.md @@ -1,6 +1,8 @@ --- -title: "execute" +title: "About execute variable" +sidebar_label: "execute" id: "execute" +description: "Use `execute` to return True when dbt is in 'execute' mode." --- `execute` is a Jinja variable that returns True when dbt is in "execute" mode. 
diff --git a/website/docs/reference/dbt-jinja-functions/flags.md b/website/docs/reference/dbt-jinja-functions/flags.md index ba393378269..6fe985cc93e 100644 --- a/website/docs/reference/dbt-jinja-functions/flags.md +++ b/website/docs/reference/dbt-jinja-functions/flags.md @@ -1,6 +1,8 @@ --- -title: "flags" +title: "About flags variable" +sidebar_label: "flags" id: "flags" +description: "The `flags` variable contains values of flags provided on the cli." --- The `flags` variable contains values of flags provided on the command line. @@ -25,26 +27,75 @@ Recommended use cases include: - different logic based on "run modes," such as `flags.FULL_REFRESH` and `flags.STORE_FAILURES` - running hooks conditionally based on the current command / task type, via `flags.WHICH` -**Note:** It is _not_ recommended to use flags as an input to parse-time configurations, properties, or dependencies (`ref` + `source`). Flags are likely to change in every invocation of dbt, and their parsed values will become stale (and yield incorrect results) in subsequent invocations that have partial parsing enabled. For more details, see [the docs on parsing](parsing). +**Note:** It is _not_ recommended to use flags as an input to parse-time configurations, properties, or dependencies (`ref` + `source`). Flags are likely to change in every invocation of dbt, and their parsed values will become stale (and yield incorrect results) in subsequent invocations that have partial parsing enabled. For more details, see [the docs on parsing](/reference/parsing). + ### invocation_args_dict -For the full set of information passed from the CLI—subcommand, flags, arguments—you can use `invocation_args_dict`. This is equivalent to the `args` dictionary in [`run_results.json`](run-results-json). +For the full set of information passed from the CLI—subcommand, flags, arguments—you can use `invocation_args_dict`. 
This is equivalent to the `args` dictionary in [`run_results.json`](/reference/artifacts/run-results-json). + + ```sql --- models/my_model.sql +-- invocation_args_dict: -- {{ invocation_args_dict }} + +-- dbt_metadata_envs: -- {{ dbt_metadata_envs }} select 1 as id ``` -Compiles to: + + + + + +```shell +$ DBT_ENV_CUSTOM_ENV_MYVAR=myvalue dbt compile -s my_model +``` + + + ```sql +-- invocation_args_dict: -- {'write_json': True, 'use_colors': True, 'printer_width': 80, 'version_check': True, 'partial_parse': True, 'static_parser': True, 'profiles_dir': '/Users/.../.dbt', 'send_anonymous_usage_stats': False, 'event_buffer_size': 100000, 'quiet': False, 'no_print': False, 'parse_only': False, 'which': 'compile', 'rpc_method': 'compile', 'indirect_selection': 'eager'} +-- dbt_metadata_envs: +-- {'MYVAR': 'myvalue'} + +select 1 as id +``` + + + + + + + + +The `invocation_command` key within `invocation_args_dict` includes the entire subcommand when it compiles: + +```shell +$ DBT_ENV_CUSTOM_ENV_MYVAR=myvalue dbt compile -s my_model + +12:10:22 Running with dbt=1.6.0-b8 +12:10:22 Registered adapter: postgres=1.6.0-b8 +12:10:22 Found 1 seed, 1 model, 349 macros +12:10:22 +12:10:22 Concurrency: 5 threads (target='dev') +12:10:22 +12:10:22 Compiled node 'my_model' is: +-- invocation_args_dict: +-- {'log_format_file': 'debug', 'log_level': 'info', 'exclude': (), 'send_anonymous_usage_stats': True, 'which': 'compile', 'defer': False, 'output': 'text', 'log_format': 'default', 'macro_debugging': False, 'populate_cache': True, 'static_parser': True, 'vars': {}, 'warn_error_options': WarnErrorOptions(include=[], exclude=[]), 'quiet': False, 'select': ('my_model',), 'indirect_selection': 'eager', 'strict_mode': False, 'version_check': False, 'enable_legacy_logger': False, 'log_path': '/Users/jerco/dev/scratch/testy/logs', 'profiles_dir': '/Users/jerco/.dbt', 'invocation_command': 'dbt compile -s my_model', 'log_level_file': 'debug', 'project_dir': 
'/Users/jerco/dev/scratch/testy', 'favor_state': False, 'use_colors_file': True, 'write_json': True, 'partial_parse': True, 'printer_width': 80, 'print': True, 'cache_selected_only': False, 'use_colors': True, 'introspect': True} + +-- dbt_metadata_envs: +-- {'MYVAR': 'myvalue'} + select 1 as id ``` + + diff --git a/website/docs/reference/dbt-jinja-functions/fromjson.md b/website/docs/reference/dbt-jinja-functions/fromjson.md index 4fcc3ad9885..5c186599a0e 100644 --- a/website/docs/reference/dbt-jinja-functions/fromjson.md +++ b/website/docs/reference/dbt-jinja-functions/fromjson.md @@ -1,6 +1,8 @@ --- -title: "fromjson" +title: "About fromjson context method" +sidebar_label: "fromjson" id: "fromjson" +description: "Deserialize a json string into python with `fromjson` context method." --- The `fromjson` context method can be used to deserialize a json string into a Python object primitive, eg. a `dict` or `list`. diff --git a/website/docs/reference/dbt-jinja-functions/fromyaml.md b/website/docs/reference/dbt-jinja-functions/fromyaml.md index 78c2975da0f..d876e383c02 100644 --- a/website/docs/reference/dbt-jinja-functions/fromyaml.md +++ b/website/docs/reference/dbt-jinja-functions/fromyaml.md @@ -1,12 +1,14 @@ --- -title: "fromyaml" +title: "About fromyaml context method" +sidebar_label: "fromyaml" id: "fromyaml" +description: "Deserialize a YAML string into python with `fromyaml` context method." --- -The `fromyaml` context method can be used to deserialize a yaml string into a Python object primitive, eg. a `dict` or `list`. +The `fromyaml` context method can be used to deserialize a YAML string into a Python object primitive, eg. a `dict` or `list`. 
__Args__: - * `string`: The yaml string to deserialize (required) + * `string`: The YAML string to deserialize (required) * `default`: A default value to return if the `string` argument cannot be deserialized (optional) ### Usage: @@ -24,7 +26,7 @@ dogs: {% do log(my_dict['dogs'], info=true) %} -- ["good", "bad"] -{% do my_dict['dogs'].pop() } +{% do my_dict['dogs'].pop() %} {% do log(my_dict['dogs'], info=true) %} -- ["good"] ``` diff --git a/website/docs/reference/dbt-jinja-functions/graph.md b/website/docs/reference/dbt-jinja-functions/graph.md index 23c8077d367..491b7836f45 100644 --- a/website/docs/reference/dbt-jinja-functions/graph.md +++ b/website/docs/reference/dbt-jinja-functions/graph.md @@ -1,6 +1,8 @@ --- -title: "graph" +title: "About graph context variable" +sidebar_label: "graph" id: "graph" +description: "The `graph` context variable contains info about nodes in your project." --- The `graph` context variable contains information about the _nodes_ in your dbt @@ -9,7 +11,7 @@ projects. :::danger Heads up -dbt actively builds the `graph` variable during the [parsing phase](execute) of +dbt actively builds the `graph` variable during the [parsing phase](/reference/dbt-jinja-functions/execute) of running dbt projects, so some properties of the `graph` context variable will be missing or incorrect during parsing. Please read the information below carefully to understand how to effectively use this variable. @@ -18,17 +20,11 @@ to understand how to effectively use this variable. ### The graph context variable - - - - In dbt v0.17.0, sources were moved out of the `graph.nodes` object and into the `graph.sources` object - - In dbt v0.20.0, exposures were added to the `graph.exposures` object - - In dbt v1.0.0, metrics were added to the `graph.metrics` object - - - The `graph` context variable is a dictionary which maps node ids onto dictionary representations of those nodes. 
A simplified example might look like: + + ```json { "nodes": { @@ -74,6 +70,68 @@ representations of those nodes. A simplified example might look like: } ``` + + + + +```json +{ + "nodes": { + "model.my_project.model_name": { + "unique_id": "model.my_project.model_name", + "config": {"materialized": "table", "sort": "id"}, + "tags": ["abc", "123"], + "path": "models/path/to/model_name.sql", + ... + }, + ... + }, + "sources": { + "source.my_project.snowplow.event": { + "unique_id": "source.my_project.snowplow.event", + "database": "analytics", + "schema": "analytics", + "tags": ["abc", "123"], + "path": "models/path/to/schema.yml", + ... + }, + ... + }, + "exposures": { + "exposure.my_project.traffic_dashboard": { + "unique_id": "exposure.my_project.traffic_dashboard", + "type": "dashboard", + "maturity": "high", + "path": "models/path/to/schema.yml", + ... + }, + ... + }, + "metrics": { + "metric.my_project.count_all_events": { + "unique_id": "metric.my_project.count_all_events", + "type": "count", + "path": "models/path/to/schema.yml", + ... + }, + ... + }, + "groups": { + "group.my_project.finance": { + "unique_id": "group.my_project.finance", + "name": "finance", + "owner": { + "email": "finance@jaffleshop.com" + } + ... + }, + ... + } +} +``` + + + The exact contract for these model and source nodes is not currently documented, but that will change in the future. @@ -81,7 +139,7 @@ but that will change in the future. The `model` entries in the `graph` dictionary will be incomplete or incorrect during parsing. If accessing the models in your project via the `graph` -variable, be sure to use the [execute](execute) flag to ensure that this code +variable, be sure to use the [execute](/reference/dbt-jinja-functions/execute) flag to ensure that this code only executes at run-time and not at parse-time. Do not use the `graph` variable to build your DAG, as the resulting dbt behavior will be undefined and likely incorrect. 
Example usage: @@ -167,7 +225,7 @@ select * from ( ### Accessing exposures -To access the sources in your dbt project programmatically, use the `exposures` +To access the exposures in your dbt project programmatically, use the `exposures` attribute of the `graph` object. Example usage: @@ -209,11 +267,11 @@ Example usage: To access the metrics in your dbt project programmatically, use the `metrics` attribute of the `graph` object. +Example usage: + ```sql -Example usage: - {% macro get_metric_sql_for(metric_name) %} {% set metrics = graph.metrics.values() %} @@ -236,3 +294,26 @@ Example usage: ``` + +### Accessing groups + +To access the groups in your dbt project programmatically, use the `groups` attribute of the `graph` object. + +Example usage: + + + +```sql + +{% macro get_group_owner_for(group_name) %} + + {% set groups = graph.groups.values() %} + + {% set owner = (groups | selectattr('owner', 'equalto', group_name) | list).pop() %} + + {{ return(owner) }} + +{% endmacro %} +``` + + diff --git a/website/docs/reference/dbt-jinja-functions/invocation_id.md b/website/docs/reference/dbt-jinja-functions/invocation_id.md index 7b43cbe5201..ded13fc4208 100644 --- a/website/docs/reference/dbt-jinja-functions/invocation_id.md +++ b/website/docs/reference/dbt-jinja-functions/invocation_id.md @@ -1,19 +1,14 @@ --- -title: "invocation_id" +title: "About invocation_id" +sidebar_label: "invocation_id" id: "invocation_id" +description: "The `invocation_id` outputs a UUID generated for this dbt command." --- -The `invocation_id` outputs a UUID generated for this dbt command. This value is -useful when auditing or analyzing dbt invocation metadata. - - - -- Support for `query-comment` is new in v0.16.1 -- Automatic inclusion of `invocation_id` in both dbt artifacts and BigQuery jobs is new in v0.19.0 - - +The `invocation_id` outputs a UUID generated for this dbt command. This value is useful when auditing or analyzing dbt invocation metadata. 
If available, the `invocation_id` is: -- available in the compilation context of [`query-comment`](query-comment) -- included in the `metadata` field of dbt artifacts +- available in the compilation context of [`query-comment`](/reference/project-configs/query-comment) +- included in the `info` dictionary in dbt [events and logs](/reference/events-logging#info) +- included in the `metadata` dictionary in [dbt artifacts](/reference/artifacts/dbt-artifacts#common-metadata) - included as a label in all BigQuery jobs that dbt originates diff --git a/website/docs/reference/dbt-jinja-functions/local-md5.md b/website/docs/reference/dbt-jinja-functions/local-md5.md new file mode 100644 index 00000000000..7b8405e4b7f --- /dev/null +++ b/website/docs/reference/dbt-jinja-functions/local-md5.md @@ -0,0 +1,20 @@ +--- +title: "About local_md5 context variable" +sidebar_label: "local_md5" +id: "local_md5" +description: "Calculate an MD5 hash of a string with `local_md5` context variable." +--- + +The `local_md5` context variable calculates an [MD5 hash](https://en.wikipedia.org/wiki/MD5) of the given string. The string `local_md5` emphasizes that the hash is calculated _locally_, in the dbt-Jinja context. This variable is typically useful for advanced use cases. For example, when you generate unique identifiers within custom materialization or operational logic, you can either avoid collisions between temporary relations or identify changes by comparing checksums. + +It is different than the `md5` SQL function, supported by many SQL dialects, which runs remotely in the data platform. You want to always use SQL hashing functions when generating surrogate keys. 
+ +Usage: +```sql +-- source +{%- set value_hash = local_md5("hello world") -%} +'{{ value_hash }}' + +-- compiled +'5eb63bbbe01eeed093cb22bb8f5acdc3' +``` diff --git a/website/docs/reference/dbt-jinja-functions/log.md b/website/docs/reference/dbt-jinja-functions/log.md index cf859686038..30e68f8c21d 100644 --- a/website/docs/reference/dbt-jinja-functions/log.md +++ b/website/docs/reference/dbt-jinja-functions/log.md @@ -1,6 +1,8 @@ --- title: "log" +sidebar_label: "About log function" id: "log" +description: "Learn more about the log Jinja function in dbt." --- __Args__: @@ -10,7 +12,34 @@ __Args__: Logs a line to either the log file or stdout. -([Source on Github](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/context/base.py#L432)) +
          + Code source + Refer to GitHub or the following code as a source:

          + +```python + def log(msg: str, info: bool = False) -> str: + """Logs a line to either the log file or stdout. + + :param msg: The message to log + :param info: If `False`, write to the log file. If `True`, write to + both the log file and stdout. + + > macros/my_log_macro.sql + + {% macro some_macro(arg1, arg2) %} + {{ log("Running some_macro: " ~ arg1 ~ ", " ~ arg2) }} + {% endmacro %}" + """ + if info: + fire_event(JinjaLogInfo(msg=msg, node_info=get_node_info())) + else: + fire_event(JinjaLogDebug(msg=msg, node_info=get_node_info())) + return "" +``` + + + +
          ```sql diff --git a/website/docs/reference/dbt-jinja-functions/model.md b/website/docs/reference/dbt-jinja-functions/model.md index 83a29e648da..9ccf0759470 100644 --- a/website/docs/reference/dbt-jinja-functions/model.md +++ b/website/docs/reference/dbt-jinja-functions/model.md @@ -1,8 +1,61 @@ --- -title: "model" +title: "About model object" +sidebar_label: "model" id: "model" +description: "`model` is the dbt graph object (or node) for the current model." --- -`model` is the dbt [graph object](graph) (or node) for the current model. It can be used to: +`model` is the dbt [graph object](/reference/dbt-jinja-functions/graph) (or node) for the current model. It can be used to: - Access `config` settings, say, in a post-hook - Access the path to the model + +For example: +```jinja +{% if model.config.materialized == 'view' %} + {{ log(model.name ~ " is a view.", info=True) }} +{% endif %} +``` + +To view the contents of `model` for a given model: + + + + + +If you're using the CLI, use [log()](/reference/dbt-jinja-functions/log) to print the full contents: + +```jinja +{{ log(model, info=True) }} +``` + + + + + + If you're using the dbt Cloud IDE, compile the following to print the full contents:

          + + ```jinja +{{ model | tojson(indent = 4) }} +``` + +
          + +
          + +## Model structure and JSON schema + +To view the structure of `models` and their definitions: +- Refer to [dbt JSON Schema](https://schemas.getdbt.com/) for describing and consuming dbt generated artifacts +- Select the corresponding manifest version under **Manifest**. For example if you're on dbt v1.3, then you would select Manifest v7 + * The `manifest.json` version number is related to (but not _equal_ to) your dbt version, so you _must_ use the correct `manifest.json` version for your dbt version. To find the correct `manifest.json` version, refer to [Manifest](/reference/artifacts/manifest-json) and select the dbt version on the top navigation (such as `v1.5`). This will help you find out which tags are associated with your model. +- Then go to `nodes` --> Select Additional properties --> `CompiledModelNode` or view other definitions/objects. + +Use the following table to understand how the versioning pattern works and match the Manifest version with the dbt version: + +import ManifestVersions from '/snippets/_manifest-versions.md'; + + + +## Related docs + +- [dbt JSON Schema](https://schemas.getdbt.com/) diff --git a/website/docs/reference/dbt-jinja-functions/modules.md b/website/docs/reference/dbt-jinja-functions/modules.md index baa8da80f13..a30367f9dd6 100644 --- a/website/docs/reference/dbt-jinja-functions/modules.md +++ b/website/docs/reference/dbt-jinja-functions/modules.md @@ -1,6 +1,8 @@ --- -title: "modules" +title: "About modules variable" +sidebar_label: "modules" id: "modules" +description: "`modules` jinja variables has useful Python modules to operate data." --- The `modules` variable in the Jinja context contains useful Python modules for operating on data. @@ -18,7 +20,7 @@ It includes the modules contexts of `date`, `datetime`, `time`, `timedelta`, and ``` This module will return the current date and time on every Jinja evaluation. For the date and time of the start of the run, please see -[run_started_at](run_started_at). 
+[run_started_at](/reference/dbt-jinja-functions/run_started_at). ## pytz This variable is a pointer to the Python [pytz](https://pypi.org/project/pytz/) module, which supports timezone logic. diff --git a/website/docs/reference/dbt-jinja-functions/on-run-end-context.md b/website/docs/reference/dbt-jinja-functions/on-run-end-context.md index 4d7a6cbd15c..32cd8ca10ff 100644 --- a/website/docs/reference/dbt-jinja-functions/on-run-end-context.md +++ b/website/docs/reference/dbt-jinja-functions/on-run-end-context.md @@ -1,6 +1,8 @@ --- -title: "on-run-end Context" +title: "About on-run-end context variable" +sidebar_label: "on-run-end context" id: "on-run-end-context" +description: "Use these variables in the context for `on-run-end` hooks." --- @@ -11,6 +13,7 @@ These variables are only available in the context for `on-run-end` hooks. They w ::: ## schemas + The `schemas` context variable can be used to reference the schemas that dbt has built models into during a run of dbt. This variable can be used to grant usage on these schemas to certain users at the end of a dbt run. Example: @@ -60,7 +63,7 @@ on-run-end: ## database_schemas -The `database_schemas` context variable can be used to reference the databases _and_ schemas that dbt has built models into during a run of dbt. This variable is similar to the `schemas` variable, and should be used if a dbt run builds resources into multiple different database. +The `database_schemas` context variable can be used to reference the databases _and_ schemas that dbt has built models into during a run of dbt. This variable is similar to the `schemas` variable, and should be used if a dbt run builds resources into multiple different databases. Example: @@ -97,13 +100,7 @@ on-run-end: ## Results - - -* `v0.19.0`: The `Result` object significantly changed its schema. See https://schemas.getdbt.com/dbt/run-results/v1.json for the full specification. 
- - - -The `results` variable contains a list of [Result objects](dbt-classes#result-objects) with one element per resource that executed in the dbt job. The Result object provides access within the Jinja on-run-end context to the information that will populate the [run results JSON artifact](run-results-json). +The `results` variable contains a list of [Result objects](/reference/dbt-classes#result-objects) with one element per resource that executed in the dbt job. The Result object provides access within the Jinja on-run-end context to the information that will populate the [run results JSON artifact](/reference/artifacts/run-results-json). Example usage: diff --git a/website/docs/reference/dbt-jinja-functions/print.md b/website/docs/reference/dbt-jinja-functions/print.md index 25ced86da62..219320a945a 100644 --- a/website/docs/reference/dbt-jinja-functions/print.md +++ b/website/docs/reference/dbt-jinja-functions/print.md @@ -1,18 +1,19 @@ --- -title: "print" +title: "About print function" +sidebar_label: "print" id: "print" +description: "Use the `print()` function to print messages to the log file and stdout." --- -## About print Use the `print()` function when you want to print messages to both the log file and standard output (stdout). -When used in conjunction with the `QUIET` global config, which suppresses non-error logs, you will only see error logs and the print messages in stdout. For more information, see [Global configs](/reference/global-configs). +When used in conjunction with the `QUIET` global config, which suppresses non-error logs, you will only see error logs and the print messages in stdout. For more information, see [Global configs](/reference/global-configs/about-global-configs). 
## Example ```sql {% macro some_macro(arg1, arg2) %} {{ print("Running some_macro: " ~ arg1 ~ ", " ~ arg2) }} - {% endmacro %}" + {% endmacro %} ``` diff --git a/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md b/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md index fb5d92da988..2a6390c3d12 100644 --- a/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md +++ b/website/docs/reference/dbt-jinja-functions/profiles-yml-context.md @@ -1,14 +1,16 @@ --- -title: "profiles.yml Context" +title: "About profiles.yml context" +sidebar_label: "profiles.yml context" id: "profiles-yml-context" +description: "Use these context methods to configure resources in `profiles.yml` file." --- -The following context variables and methods are available when configuring +The following context methods are available when configuring resources in the `profiles.yml` file. -**Available context variables:** -- [env_var](env_var) -- [vars](var) (_Note: only variables defined with `--vars` are available_) +**Available context methods:** +- [env_var](/reference/dbt-jinja-functions/env_var) +- [var](/reference/dbt-jinja-functions/var) (_Note: only variables defined with `--vars` are available_) ### Example usage diff --git a/website/docs/reference/dbt-jinja-functions/project_name.md b/website/docs/reference/dbt-jinja-functions/project_name.md index 9fba9254ddf..7f76c5a4800 100644 --- a/website/docs/reference/dbt-jinja-functions/project_name.md +++ b/website/docs/reference/dbt-jinja-functions/project_name.md @@ -1,10 +1,10 @@ --- -title: "project_name" +title: "About project_name context variable" +sidebar_label: "project_name" id: "project_name" +description: "Read this guide to understand the project_name Jinja function in dbt." --- -New in 0.16.0 - The `project_name` context variable returns the `name` for the root-level project which is being run by dbt. This variable can be used to defer execution to a root-level project macro if one exists. 
diff --git a/website/docs/reference/dbt-jinja-functions/ref.md b/website/docs/reference/dbt-jinja-functions/ref.md index 9be20d0a226..fda5992e234 100644 --- a/website/docs/reference/dbt-jinja-functions/ref.md +++ b/website/docs/reference/dbt-jinja-functions/ref.md @@ -1,10 +1,10 @@ --- -title: "ref" +title: "About ref function" +sidebar_label: "ref" id: "ref" +description: "Read this guide to understand the ref Jinja function in dbt." --- -## Overview - The most important function in dbt is `ref()`; it's impossible to build even moderately complex models without it. `ref()` is how you reference one model within another. This is a very common behavior, as typically models are built to be "stacked" on top of one another. Here is how this looks in practice: @@ -29,23 +29,62 @@ from {{ref('model_a')}} `ref()` is, under the hood, actually doing two important things. First, it is interpolating the schema into your model file to allow you to change your deployment schema via configuration. Second, it is using these references between models to automatically build the dependency graph. This will enable dbt to deploy models in the correct order when using `dbt run`. -:::info New in 0.9.0 +The `{{ ref }}` function returns a `Relation` object that has the same `table`, `schema`, and `name` attributes as the [{{ this }} variable](/reference/dbt-jinja-functions/this). + - Note — Prior to dbt v1.6, the dbt Cloud IDE returns `request` as the result of `{{ ref.identifier }}`. -The `{{ ref }}` function returns a `Relation` object that has the same `table`, `schema`, and `name` attributes at the [{{ this }}](this) variable. +## Advanced ref usage -::: +### Versioned ref -## Advanced ref usage +The `ref` function supports an optional keyword argument - `version` (or `v`). 
+When a version argument is provided to the `ref` function, dbt returns the `Relation` object corresponding to the specified version of the referenced model. 
+ +This functionality is useful when referencing versioned models that make breaking changes by creating new versions, but guarantees no breaking changes to existing versions of the model. + +If the `version` argument is not supplied to a `ref` of a versioned model, dbt uses the latest version. This has the benefit of automatically incorporating the latest changes of a referenced model, but there is a risk of incorporating breaking changes. + +#### Example: + + +```yml + +models: + - name: model_name + latest_version: 2 + versions: + - v: 2 + - v: 1 +``` + + + +```sql + -- returns the `Relation` object corresponding to version 1 of model_name +select * from {{ ref('model_name', version=1) }} +``` + +```sql + -- returns the `Relation` object corresponding to version 2 (the latest version) of model_name +select * from {{ ref('model_name') }} +``` ### Two-argument variant -There is also a two-argument variant of the `ref` function. With this variant, you can pass both a package name and model name to `ref` to avoid ambiguity. This functionality is not commonly required for typical dbt usage. +You can also use a two-argument variant of the `ref` function. With this variant, you can pass both a namespace (project or package) and model name to `ref` to avoid ambiguity. When using two arguments with projects (not packages), you also need to set [cross project dependencies](/docs/collaborate/govern/project-dependencies). ```sql -select * from {{ ref('package_name', 'model_name') }} +select * from {{ ref('project_or_package', 'model_name') }} ``` -**Note:** The `package_name` should only include the name of the package, not the maintainer. For example, if you use the [`fivetran/stripe`](https://hub.getdbt.com/fivetran/stripe/latest/) package, type `stripe` in that argument, and not `fivetran/stripe`. +We recommend using two-argument `ref` any time you are referencing a model defined in a different package or project. 
While not required in all cases, it's more explicit for you, for dbt, and for future readers of your code. + + + +We especially recommend using two-argument `ref` to avoid ambiguity, in cases where a model name is duplicated across multiple projects or installed packages. If you use one-argument `ref` (just the `model_name`), dbt will look for a model by that name in the same namespace (package or project); if it finds none, it will raise an error. + + + +**Note:** The `project_or_package` should match the `name` of the project/package, as defined in its `dbt_project.yml`. This might be different from the name of the repository. It never includes the repository's organization name. For example, if you use the [`fivetran/stripe`](https://hub.getdbt.com/fivetran/stripe/latest/) package, the package name is `stripe`, not `fivetran/stripe`. ### Forcing Dependencies diff --git a/website/docs/reference/dbt-jinja-functions/return.md b/website/docs/reference/dbt-jinja-functions/return.md index 7878f29aa1c..43bbddfa2d1 100644 --- a/website/docs/reference/dbt-jinja-functions/return.md +++ b/website/docs/reference/dbt-jinja-functions/return.md @@ -1,6 +1,8 @@ --- -title: "return" +title: "About return function" +sidebar_label: "return" id: "return" +description: "Read this guide to understand the return Jinja function in dbt." --- __Args__: @@ -9,7 +11,7 @@ __Args__: The `return` function can be used in macros to return data to the caller. The type of the data (dict, list, int, etc) will be preserved through the `return` call. - + ```sql {% macro get_data() %} @@ -29,7 +31,7 @@ The `return` function can be used in macros to return data to the caller. The ty select -- getdata() returns a list! 
- {% for i in getdata() %} + {% for i in get_data() %} {{ i }} {% if not loop.last %},{% endif %} {% endfor %} diff --git a/website/docs/reference/dbt-jinja-functions/run_query.md b/website/docs/reference/dbt-jinja-functions/run_query.md index ad2de09b080..cdd65a7b4dc 100644 --- a/website/docs/reference/dbt-jinja-functions/run_query.md +++ b/website/docs/reference/dbt-jinja-functions/run_query.md @@ -1,9 +1,11 @@ --- -title: "run_query" +title: "About run_query macro" +sidebar_label: "run_query" id: "run_query" +description: "Use `run_query` macro to run queries and fetch results." --- -The `run_query` macro provides a convenient way to run queries and fetch their results. It is a wrapper around the [statement block](statement-blocks), which is more flexible, but also more complicated to use. +The `run_query` macro provides a convenient way to run queries and fetch their results. It is a wrapper around the [statement block](/reference/dbt-jinja-functions/statement-blocks), which is more flexible, but also more complicated to use. __Args__: * `sql`: The SQL query to execute @@ -13,7 +15,7 @@ Returns a [Table](https://agate.readthedocs.io/page/api/table.html) object with **Note:** The `run_query` macro will not begin a transaction automatically - if you wish to run your query inside of a transaction, please use `begin` and `commit ` statements as appropriate. :::info Using run_query for the first time? -Check out the section of the Getting Started guide on [using Jinja](/docs/get-started/learning-more/using-jinja#dynamically-retrieve-the-list-of-payment-methods) for an example of working with the results of the `run_query` macro! +Check out the section of the Getting Started guide on [using Jinja](/guides/advanced/using-jinja#dynamically-retrieve-the-list-of-payment-methods) for an example of working with the results of the `run_query` macro! 
::: **Example Usage:** @@ -95,3 +97,17 @@ You can also use `run_query` to perform SQL queries that aren't select statement ``` + + +Use the `length` filter to verify whether `run_query` returned any rows or not. Make sure to wrap the logic in an [if execute](/reference/dbt-jinja-functions/execute) block to avoid unexpected behavior during parsing. + +```sql +{% if execute %} +{% set results = run_query(payment_methods_query) %} +{% if results|length > 0 %} + -- do something with `results` here... +{% else %} + -- do fallback here... +{% endif %} +{% endif %} +``` diff --git a/website/docs/reference/dbt-jinja-functions/run_started_at.md b/website/docs/reference/dbt-jinja-functions/run_started_at.md index d11a9abaf19..9dfc83ec56a 100644 --- a/website/docs/reference/dbt-jinja-functions/run_started_at.md +++ b/website/docs/reference/dbt-jinja-functions/run_started_at.md @@ -1,6 +1,8 @@ --- -title: "run_started_at" +title: "About run_started_at variable" +sidebar_label: "run_started_at" id: "run_started_at" +description: "Use `run_started_at` to output the timestamp the run started." --- `run_started_at` outputs the timestamp that this run started, e.g. `2017-04-21 01:23:45.678`. diff --git a/website/docs/reference/dbt-jinja-functions/schema.md b/website/docs/reference/dbt-jinja-functions/schema.md index c49e5616798..8f62a57ae8a 100644 --- a/website/docs/reference/dbt-jinja-functions/schema.md +++ b/website/docs/reference/dbt-jinja-functions/schema.md @@ -1,6 +1,8 @@ --- -title: "schema" +title: "About model schema" +sidebar_label: "schema" id: "schema" +description: "The schema that the model is configured to be materialized in." --- The schema that the model is configured to be materialized in. This is typically the same as `model['schema']`. 
diff --git a/website/docs/reference/dbt-jinja-functions/schemas.md b/website/docs/reference/dbt-jinja-functions/schemas.md index 4047d0ef856..23c9ee8dee1 100644 --- a/website/docs/reference/dbt-jinja-functions/schemas.md +++ b/website/docs/reference/dbt-jinja-functions/schemas.md @@ -1,6 +1,8 @@ --- -title: "schemas" +title: "About schemas variable" +sidebar_label: "schemas" id: "schemas" +description: "A list of schemas where dbt built objects during the current run." --- `schemas` is a variable available in an `on-run-end` hook, representing a list of schemas that dbt built objects in on this run. diff --git a/website/docs/reference/dbt-jinja-functions/selected_resources.md b/website/docs/reference/dbt-jinja-functions/selected_resources.md index f359a88553c..a927ec317ae 100644 --- a/website/docs/reference/dbt-jinja-functions/selected_resources.md +++ b/website/docs/reference/dbt-jinja-functions/selected_resources.md @@ -1,6 +1,8 @@ --- -title: "selected_resources" +title: "About selected_resources context variable" +sidebar_label: "selected_resources" id: "selected_resources" +description: "Contains a list of all the nodes selected by current dbt command." --- The `selected_resources` context variable contains a list of all the _nodes_ @@ -10,7 +12,7 @@ Currently, this variable is not accessible when using the command `run-operation :::danger Warning! -dbt actively builds the graph during the [parsing phase](execute) of +dbt actively builds the graph during the [parsing phase](/reference/dbt-jinja-functions/execute) of running dbt projects, so the `selected_resources` context variable will be empty during parsing. Please read the information on this page to effectively use this variable. @@ -28,6 +30,8 @@ For a given run it will look like: ["model.my_project.model1", "model.my_project.model2", "snapshot.my_project.my_snapshot"] ``` +Each value corresponds to a key in the `nodes` object within the [graph](/reference/dbt-jinja-functions/graph) context variable. 
+ It can be used in macros in a `pre-hook`, `post-hook`, `on-run-start` or `on-run-end` to evaluate what nodes are selected and trigger different logic whether a particular node is selected or not. @@ -49,7 +53,7 @@ is selected or not. {% do log("model1 is not included based on the current selection", info=true) %} - {% endfor %} + {% endif %} {% endif %} /* @@ -66,4 +70,4 @@ is selected or not. */ ``` - \ No newline at end of file + diff --git a/website/docs/reference/dbt-jinja-functions/set.md b/website/docs/reference/dbt-jinja-functions/set.md index d78f5777828..d85e0539924 100644 --- a/website/docs/reference/dbt-jinja-functions/set.md +++ b/website/docs/reference/dbt-jinja-functions/set.md @@ -1,10 +1,10 @@ --- -title: "set" +title: "About set context method" +sidebar_label: "set" id: "set" +description: "Converts any iterable to a sequence of iterable and unique elements." --- -### set - _Not to be confused with the `{% set foo = "bar" ... %}` expression in Jinja!_ The `set` context method can be used to convert any iterable to a sequence of iterable elements that are unique (a set). diff --git a/website/docs/reference/dbt-jinja-functions/source.md b/website/docs/reference/dbt-jinja-functions/source.md index a49a3b56049..59317a79e3d 100644 --- a/website/docs/reference/dbt-jinja-functions/source.md +++ b/website/docs/reference/dbt-jinja-functions/source.md @@ -1,7 +1,10 @@ --- -title: "source" +title: "About source function" +sidebar_label: "source" id: "source" +description: "Read this guide to understand the source Jinja function in dbt." 
--- + ```sql select * from {{ source(source_name, table_name) }} ``` @@ -9,10 +12,11 @@ select * from {{ source(source_name, table_name) }} ## Definition This function: -- Returns a [Relation](dbt-classes#relation) for a [source](/docs/build/sources) +- Returns a [Relation](/reference/dbt-classes#relation) for a [source](/docs/build/sources) - Creates dependencies between a source and the current model, which is useful for documentation and model selection - Compiles to the full object name in the database + ## Related guides - [Using sources](/docs/build/sources) @@ -22,7 +26,7 @@ This function: ## Example -Consider a source defined like so: +Consider a source defined as follows: diff --git a/website/docs/reference/dbt-jinja-functions/statement-blocks.md b/website/docs/reference/dbt-jinja-functions/statement-blocks.md index c6de74a0225..2829ad3fe14 100644 --- a/website/docs/reference/dbt-jinja-functions/statement-blocks.md +++ b/website/docs/reference/dbt-jinja-functions/statement-blocks.md @@ -1,8 +1,16 @@ --- -title: "statement blocks" +title: "About statement blocks" +sidebar_label: "statement blocks" id: "statement-blocks" +description: "SQL queries that hit database and return results to your jinja context." --- +:::tip Recommendation + +We recommend using the [`run_query` macro](/reference/dbt-jinja-functions/run_query) instead of `statement` blocks. The `run_query` macro provides a more convenient way to run queries and fetch their results by wrapping `statement` blocks. You can use this macro to write more concise code that is easier to maintain. + +::: + `statement`s are sql queries that hit the database and return results to your Jinja context. Here’s an example of a `statement` which gets all of the states from a users . @@ -29,16 +37,10 @@ __Args__: - `auto_begin` (bool): If True, open a transaction if one does not exist. If false, do not open a transaction. 
Once the statement block has executed, the result set is accessible via the `load_result` function. The result object includes three keys: -- `response`: Structured object containing metadata returned from the database, which varies by adapter. E.g. success `code`, number of `rows_affected`, total `bytes_processed`, etc. Comparable to `adapter_response` in the [Result object](dbt-classes#result-objects). +- `response`: Structured object containing metadata returned from the database, which varies by adapter. E.g. success `code`, number of `rows_affected`, total `bytes_processed`, etc. Comparable to `adapter_response` in the [Result object](/reference/dbt-classes#result-objects). - `data`: Pythonic representation of data returned by query (arrays, tuples, dictionaries). - `table`: [Agate](https://agate.readthedocs.io/page/api/table.html) table representation of data returned by query. - - -* `v0.19.0`: The `response` structured object replaced a `status` string that contained similar information. - - - For the above statement, that could look like: @@ -67,11 +69,3 @@ The contents of the returned `data` field is a matrix. It contains a list rows, ``` - - - -:::danger Volatile API - -While the `statement` and `load_result` setup works for now, we intend to improve this interface in the future. If you have questions or suggestions, please let us know in GitHub or on Slack. - -::: diff --git a/website/docs/reference/dbt-jinja-functions/target.md b/website/docs/reference/dbt-jinja-functions/target.md index a001288c2fb..7d6627c5a4b 100644 --- a/website/docs/reference/dbt-jinja-functions/target.md +++ b/website/docs/reference/dbt-jinja-functions/target.md @@ -1,11 +1,13 @@ --- -title: "target" +title: "About target variable" +sidebar_label: "target" id: "target" +description: "Contains information about your connection to the warehouse." --- `target` contains information about your connection to the warehouse. 
-* **dbt CLI:** These values are based on the target defined in your [`profiles.yml` file](reference/profiles.yml.md) +* **dbt CLI:** These values are based on the target defined in your [`profiles.yml` file](/docs/core/connect-data-platform/profiles.yml) * **dbt Cloud Scheduler:** * `target.name` is defined per job as described [here](/docs/build/custom-target-names). * For all other attributes, the values are defined by the deployment connection. To check these values, click **Deploy** from the upper left and select **Environments**. Then, select the relevant deployment environment, and click **Settings**. diff --git a/website/docs/reference/dbt-jinja-functions/this.md b/website/docs/reference/dbt-jinja-functions/this.md index 66dbaafa4f2..f9f2961b08f 100644 --- a/website/docs/reference/dbt-jinja-functions/this.md +++ b/website/docs/reference/dbt-jinja-functions/this.md @@ -1,37 +1,26 @@ --- -title: "this" +title: "about this" +sidebar_label: "this" id: "this" +description: "Represents the current model in the database." +keywords: + - relation, relation object, this function, this jinja, this.database, this.schema, this.identifier +meta: + label: 'this' --- `this` is the database representation of the current model. It is useful when: - Defining a `where` statement within [incremental models](/docs/build/incremental-models) -- Using [pre or post hooks](pre-hook-post-hook) +- Using [pre or post hooks](/reference/resource-configs/pre-hook-post-hook) -`this` is a [Relation](dbt-classes#relation), and as such, properties such as `{{ this.database }}` and `{{ this.schema }}` compile as expected. +`this` is a [Relation](/reference/dbt-classes#relation), and as such, properties such as `{{ this.database }}` and `{{ this.schema }}` compile as expected. + - Note — Prior to dbt v1.6, the dbt Cloud IDE returns `request` as the result of `{{ ref.identifier }}`. `this` can be thought of as equivalent to `ref('')`, and is a neat way to avoid circular dependencies. 
## Examples - - - - -### Grant permissions on a model in a post-hook - - - -```yaml -models: - project-name: - +post-hook: - - "grant select on {{ this }} to db_reader" -``` - - - - - + ### Configuring incremental models @@ -52,3 +41,7 @@ from raw_app_data.events ``` + + + + \ No newline at end of file diff --git a/website/docs/reference/dbt-jinja-functions/thread_id.md b/website/docs/reference/dbt-jinja-functions/thread_id.md new file mode 100644 index 00000000000..84eb32d5bff --- /dev/null +++ b/website/docs/reference/dbt-jinja-functions/thread_id.md @@ -0,0 +1,16 @@ +--- +title: "About thread_id" +sidebar_label: "thread_id" +id: "thread_id" +description: "The `thread_id` outputs an identifier for the current Python thread." +--- + +The `thread_id` outputs an identifier for the current Python thread that is executing a node, like `Thread-1`. + +This value is useful when auditing or analyzing dbt invocation metadata. It corresponds to the `thread_id` within the [`Result` object](/reference/dbt-classes#result-objects) and [`run_results.json`](/reference/artifacts/run-results-json). + +If available, the `thread_id` is: +- available in the compilation context of [`query-comment`](/reference/project-configs/query-comment) +- included in the `info` dictionary in dbt [events and logs](/reference/events-logging#info) +- included in the `metadata` dictionary in [dbt artifacts](/reference/artifacts/dbt-artifacts#common-metadata) +- included as a label in all BigQuery jobs that dbt originates diff --git a/website/docs/reference/dbt-jinja-functions/tojson.md b/website/docs/reference/dbt-jinja-functions/tojson.md index 7cf44f870dd..4225844dd19 100644 --- a/website/docs/reference/dbt-jinja-functions/tojson.md +++ b/website/docs/reference/dbt-jinja-functions/tojson.md @@ -1,12 +1,14 @@ --- -title: "tojson" +title: "About tojson context method" +sidebar_label: "tojson" id: "tojson" +description: "Use this context method to serialize a Python object primitive." 
--- The `tojson` context method can be used to serialize a Python object primitive, eg. a `dict` or `list` to a JSON string. __Args__: - * `value`: The value serialize to json (required) + * `value`: The value to serialize to json (required) * `default`: A default value to return if the `value` argument cannot be serialized (optional) ### Usage: diff --git a/website/docs/reference/dbt-jinja-functions/toyaml.md b/website/docs/reference/dbt-jinja-functions/toyaml.md index 16efa8aeaa1..93e343be220 100644 --- a/website/docs/reference/dbt-jinja-functions/toyaml.md +++ b/website/docs/reference/dbt-jinja-functions/toyaml.md @@ -1,12 +1,14 @@ --- -title: "toyaml" +title: "About toyaml context method" +sidebar_label: "toyaml" id: "toyaml" +description: "Used to serialize a Python object primitive." --- -The `toyaml` context method can be used to serialize a Python object primitive, eg. a `dict` or `list` to a yaml string. +The `toyaml` context method can be used to serialize a Python object primitive, eg. a `dict` or `list` to a YAML string. __Args__: - * `value`: The value serialize to yaml (required) + * `value`: The value to serialize to YAML (required) * `default`: A default value to return if the `value` argument cannot be serialized (optional) ### Usage: diff --git a/website/docs/reference/dbt-jinja-functions/var.md b/website/docs/reference/dbt-jinja-functions/var.md index 8956cebd6b4..df779f97863 100644 --- a/website/docs/reference/dbt-jinja-functions/var.md +++ b/website/docs/reference/dbt-jinja-functions/var.md @@ -1,6 +1,8 @@ --- -title: "var" +title: "About var function" +sidebar_label: "var" id: "var" +description: "Pass variables from `dbt_project.yml` file into models." --- Variables can be passed from your `dbt_project.yml` file into models during compilation. 
diff --git a/website/docs/reference/dbt-jinja-functions/zip.md b/website/docs/reference/dbt-jinja-functions/zip.md index 78c92e18925..ec95f7ac407 100644 --- a/website/docs/reference/dbt-jinja-functions/zip.md +++ b/website/docs/reference/dbt-jinja-functions/zip.md @@ -1,11 +1,11 @@ --- -title: "zip" +title: "About zip context method" id: "zip" +sidebar_label: "zip" +description: "Use this context method to return an iterator of tuples." --- -### zip - -The `zip` context method can be used to used to return an iterator of tuples, where the i-th tuple contains the i-th element from each of the argument iterables. ([Python docs](https://docs.python.org/3/library/functions.html#zip)) +The `zip` context method can be used to return an iterator of tuples, where the i-th tuple contains the i-th element from each of the argument iterables. ([Python docs](https://docs.python.org/3/library/functions.html#zip)) :param :param diff --git a/website/docs/reference/dbt_project.yml.md b/website/docs/reference/dbt_project.yml.md index 65855cc8c1c..571e930d7da 100644 --- a/website/docs/reference/dbt_project.yml.md +++ b/website/docs/reference/dbt_project.yml.md @@ -1,10 +1,19 @@ - -- **v1.0.0:** The default config name for `data-paths` is now [`seed-paths`](seed-paths), `source-paths` is now [`model-paths`](model-paths) and `modules-path` is now [`packages-install-path`](packages-install-path). +Every [dbt project](/docs/build/projects) needs a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. It also contains important information that tells dbt how to operate on your project. - + -Every [dbt project](projects) needs a `dbt_project.yml` file — this is how dbt knows a directory is a dbt project. It also contains important information that tells dbt how to operate on your project. +By default, dbt will look for `dbt_project.yml` in your current working directory and its parents, but you can set a different directory using the `--project-dir` flag. 
+ + + + + +By default, dbt will look for `dbt_project.yml` in your current working directory and its parents, but you can set a different directory using the `--project-dir` flag or the `DBT_PROJECT_DIR` environment variable. + +Starting from dbt v1.5 and higher, you can specify your dbt Cloud project ID in the `dbt_project.yml` file using the `dbt-cloud` config, which doesn't require validation or storage in the project config class. To find your project ID, check your dbt Cloud project URL, such as `https://cloud.getdbt.com/11/projects/123456`, where the project ID is `123456`. + + The following is a list of all available configurations in the `dbt_project.yml` file. @@ -12,65 +21,144 @@ The following is a list of all available configurations in the `dbt_project.yml` dbt uses YAML in a few different places. If you're new to YAML, it would be worth taking the time to learn how arrays, dictionaries and strings are represented. ::: + + + ```yml -[name](project-configs/name): string +[name](/reference/project-configs/name): string -[config-version](project-configs/config-version): 2 -[version](project-configs/version): version +[config-version](/reference/project-configs/config-version): 2 +[version](/reference/project-configs/version): version -[profile](project-configs/profile): profilename +[profile](/reference/project-configs/profile): profilename -[model-paths](project-configs/model-paths): [directorypath] -[seed-paths](project-configs/seed-paths): [directorypath] -[test-paths](project-configs/test-paths): [directorypath] -[analysis-paths](project-configs/analysis-paths): [directorypath] -[macro-paths](project-configs/macro-paths): [directorypath] -[snapshot-paths](project-configs/snapshot-paths): [directorypath] -[docs-paths](project-configs/docs-paths): [directorypath] -[asset-paths](project-configs/asset-paths): [directorypath] +[model-paths](/reference/project-configs/model-paths): [directorypath] +[seed-paths](/reference/project-configs/seed-paths): 
[directorypath] +[test-paths](/reference/project-configs/test-paths): [directorypath] +[analysis-paths](/reference/project-configs/analysis-paths): [directorypath] +[macro-paths](/reference/project-configs/macro-paths): [directorypath] +[snapshot-paths](/reference/project-configs/snapshot-paths): [directorypath] +[docs-paths](/reference/project-configs/docs-paths): [directorypath] +[asset-paths](/reference/project-configs/asset-paths): [directorypath] -[target-path](project-configs/target-path): directorypath -[log-path](project-configs/log-path): directorypath -[packages-install-path](project-configs/packages-install-path): directorypath +[target-path](/reference/project-configs/target-path): directorypath +[log-path](/reference/project-configs/log-path): directorypath +[packages-install-path](/reference/project-configs/packages-install-path): directorypath -[clean-targets](project-configs/clean-targets): [directorypath] +[clean-targets](/reference/project-configs/clean-targets): [directorypath] -[query-comment](project-configs/query-comment): string +[query-comment](/reference/project-configs/query-comment): string -[require-dbt-version](project-configs/require-dbt-version): version-range | [version-range] +[require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range] -[quoting](project-configs/quoting): +[dbt-cloud](/docs/cloud/cloud-cli-installation): + project-id: project_id #Required + defer-env-id: 5678 #Optional + +[quoting](/reference/project-configs/quoting): database: true | false schema: true | false identifier: true | false models: - [](model-configs) + [](/reference/model-configs) seeds: - [](seed-configs) + [](/reference/seed-configs) snapshots: - [](snapshot-configs) + [](/reference/snapshot-configs) sources: [](source-configs) tests: - [](test-configs) + [](/reference/test-configs) vars: [](/docs/build/project-variables) -[on-run-start](project-configs/on-run-start-on-run-end): sql-statement | 
[sql-statement] -[on-run-end](project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] +[on-run-start](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] +[on-run-end](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] -[dispatch](project-configs/dispatch-config): +[dispatch](/reference/project-configs/dispatch-config): - macro_namespace: packagename search_order: [packagename] +[restrict-access](/docs/collaborate/govern/model-access): true | false + ``` + + + + + + +```yml +[name](/reference/project-configs/name): string + +[config-version](/reference/project-configs/config-version): 2 +[version](/reference/project-configs/version): version + +[profile](/reference/project-configs/profile): profilename + +[model-paths](/reference/project-configs/model-paths): [directorypath] +[seed-paths](/reference/project-configs/seed-paths): [directorypath] +[test-paths](/reference/project-configs/test-paths): [directorypath] +[analysis-paths](/reference/project-configs/analysis-paths): [directorypath] +[macro-paths](/reference/project-configs/macro-paths): [directorypath] +[snapshot-paths](/reference/project-configs/snapshot-paths): [directorypath] +[docs-paths](/reference/project-configs/docs-paths): [directorypath] +[asset-paths](/reference/project-configs/asset-paths): [directorypath] + +[target-path](/reference/project-configs/target-path): directorypath +[log-path](/reference/project-configs/log-path): directorypath +[packages-install-path](/reference/project-configs/packages-install-path): directorypath + +[clean-targets](/reference/project-configs/clean-targets): [directorypath] + +[query-comment](/reference/project-configs/query-comment): string + +[require-dbt-version](/reference/project-configs/require-dbt-version): version-range | [version-range] + +[quoting](/reference/project-configs/quoting): + database: true | false + schema: true | false + identifier: true | false + +models: + 
[](/reference/model-configs) + +seeds: + [](/reference/seed-configs) + +snapshots: + [](/reference/snapshot-configs) + +sources: + [](source-configs) + +tests: + [](/reference/test-configs) + +vars: + [](/docs/build/project-variables) + +[on-run-start](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] +[on-run-end](/reference/project-configs/on-run-start-on-run-end): sql-statement | [sql-statement] + +[dispatch](/reference/project-configs/dispatch-config): + - macro_namespace: packagename + search_order: [packagename] + +[restrict-access](/docs/collaborate/govern/model-access): true | false + +``` + + + + diff --git a/website/docs/reference/dbtignore.md b/website/docs/reference/dbtignore.md index 268ac855d3a..8733fc592cd 100644 --- a/website/docs/reference/dbtignore.md +++ b/website/docs/reference/dbtignore.md @@ -2,7 +2,7 @@ title: .dbtignore --- -You can create a `.dbtignore` file in the root of your [dbt project](projects) to specify files that should be **entirely** ignored by dbt. The file behaves like a [`.gitignore` file, using the same syntax](https://git-scm.com/docs/gitignore). Files and subdirectories matching the pattern will not be read, parsed, or otherwise detected by dbt—as if they didn't exist. +You can create a `.dbtignore` file in the root of your [dbt project](/docs/build/projects) to specify files that should be **entirely** ignored by dbt. The file behaves like a [`.gitignore` file, using the same syntax](https://git-scm.com/docs/gitignore). Files and subdirectories matching the pattern will not be read, parsed, or otherwise detected by dbt—as if they didn't exist. 
**Examples** diff --git a/website/docs/reference/events-logging.md b/website/docs/reference/events-logging.md index 4e643e13b01..dec1dafcb8e 100644 --- a/website/docs/reference/events-logging.md +++ b/website/docs/reference/events-logging.md @@ -2,13 +2,9 @@ title: "Events and logs" --- -:::info New in v1.0 -While dbt has always generated logs, the eventing and structured logging system described below is new in v1. -::: - -With every task that dbt performs, it generates events. It records those events as log messages, and writes them (in real time) to two places: +As dbt runs, it generates events. The most common way to see those events is as log messages, written in real time to two places: - The command line terminal (`stdout`), to provide interactive feedback while running dbt. -- The debug log file (`logs/dbt.log`), to enable detailed [debugging of errors](debugging-errors) when they occur. The text-formatted log messages in this file include all `DEBUG`-level events, as well as contextual information, such as log level and thread name. The location of this file can be configured via [the `log_path` config](log-path). +- The debug log file (`logs/dbt.log`), to enable detailed [debugging of errors](/guides/best-practices/debugging-errors) when they occur. The text-formatted log messages in this file include all `DEBUG`-level events, as well as contextual information, such as log level and thread name. The location of this file can be configured via [the `log_path` config](/reference/project-configs/log-path). @@ -21,18 +17,113 @@ With every task that dbt performs, it generates events. 
It records those events ```text -============================== 2021-12-02 21:29:35.417263 | c83a0afc-7ed3-49e7-8c0e-797af7f9d7b6 ============================== -21:29:35.417263 [info ] [MainThread]: Running with dbt=1.0.0-rc3 -21:29:35.417955 [debug] [MainThread]: running dbt with arguments Namespace(cls=, debug=None, defer=None, exclude=None, fail_fast=None, full_refresh=False, log_cache_events=False, log_format=None, partial_parse=None, printer_width=None, profile=None, profiles_dir='/Users/jerco/.dbt', project_dir=None, record_timing_info=None, rpc_method='run', select=None, selector_name=None, send_anonymous_usage_stats=None, single_threaded=False, state=None, static_parser=None, target=None, threads=None, use_colors=None, use_experimental_parser=None, vars='{}', version_check=None, warn_error=None, which='run', write_json=None) -... -21:29:35.814348 [debug] [Thread-1 ]: On model.my_project.my_table: BEGIN +============================== 21:21:15.272780 | 48cef052-3819-4550-a83a-4a648aef5a31 ============================== +21:21:15.272780 [info ] [MainThread]: Running with dbt=1.5.0-b5 +21:21:15.273802 [debug] [MainThread]: running dbt with arguments {'printer_width': '80', 'indirect_selection': 'eager', 'log_cache_events': 'False', 'write_json': 'True', 'partial_parse': 'True', 'cache_selected_only': 'False', 'warn_error': 'None', 'fail_fast': 'False', 'debug': 'False', 'log_path': '/Users/jerco/dev/scratch/testy/logs', 'profiles_dir': '/Users/jerco/.dbt', 'version_check': 'False', 'use_colors': 'False', 'use_experimental_parser': 'False', 'no_print': 'None', 'quiet': 'False', 'log_format': 'default', 'static_parser': 'True', 'introspect': 'True', 'warn_error_options': 'WarnErrorOptions(include=[], exclude=[])', 'target_path': 'None', 'send_anonymous_usage_stats': 'True'} +21:21:16.190990 [debug] [MainThread]: Partial parsing enabled: 0 files deleted, 0 files added, 0 files changed. 
+21:21:16.191404 [debug] [MainThread]: Partial parsing enabled, no changes found, skipping parsing +21:21:16.207330 [info ] [MainThread]: Found 2 models, 0 tests, 0 snapshots, 1 analysis, 535 macros, 0 operations, 1 seed file, 0 sources, 0 exposures, 0 metrics, 0 groups + ``` ## Structured logging -When `json` [log formatting](global-configs#log-formatting) is enabled, dbt will output produce rich, structured log information which can be piped into monitoring tools for analysis, or to power applications with dbt metadata in real time. +_For more details about how the eventing system has been implemented in dbt-core, see the [`events` module README](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/events/README.md)._ + + + +Starting in v1.4, the structure of each event in `dbt-core` is backed by a schema defined using [protocol buffers](https://developers.google.com/protocol-buffers). All schemas are defined in the [`types.proto`](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/events/types.proto) file within the `dbt-core` codebase. + +Every event has the same two top-level keys: +- `info`: Information common to all events. See the table below for the breakdown. +- `data`: Additional structured data specific to this event. If this event relates to a specific node within your dbt project, it will contain a `node_info` dictionary with common attributes. + +### `info` fields + +| Field | Description | +|-------------|---------------| +| `category` | Placeholder for future use (see [dbt-labs/dbt-core#5958](https://github.com/dbt-labs/dbt-core/issues/5958)) | +| `code` | Unique shorthand identifier for this event type, e.g. 
`A123` | +| `extra` | Dictionary of custom environment metadata, based on environment variables prefixed with `DBT_ENV_CUSTOM_ENV_` | +| [`invocation_id`](/reference/dbt-jinja-functions/invocation_id) | A unique identifier for this invocation of dbt | +| `level` | A string representation of the log level (`debug`, `info`, `warn`, `error`) | +| `log_version` | Integer indicating version | +| `msg` | Human-friendly log message, constructed from structured `data`. **Note**: This message is not intended for machine consumption. Log messages are subject to change in future versions of dbt. | +| `name` | Unique name for this event type, matching the proto schema name | +| `pid` | The process ID for the running dbt invocation which produced this log message | +| `thread_name` | The thread in which the log message was produced, helpful for tracking queries when dbt is run with multiple threads | +| `ts` | When the log line was printed | + +### `node_info` fields + +Many events are fired while compiling or running a specific DAG node (model, seed, test, etc). When it's available, the `node_info` object will include: + +| Field | Description | +|-------------|---------------| +| `materialized` | view, table, incremental, etc. 
| +| `meta` | User-configured [`meta` dictionary](/reference/resource-configs/meta) for this node | +| `node_finished_at` | Timestamp when node processing completed | +| `node_name` | Name of this model/seed/test/etc | +| `node_path` | File path to where this resource is defined | +| `node_relation` | Nested object containing this node's database representation: `database`, `schema`, `alias`, and full `relation_name` with quoting & inclusion policies applied (added in v1.5) | +| `node_started_at` | Timestamp when node processing started | +| `node_status` | Current status of the node, either `RunningStatus` (while running) or `NodeStatus` (finished) as defined in [the result contract](https://github.com/dbt-labs/dbt-core/blob/eba90863ed4043957330ea44ca267db1a2d81fcd/core/dbt/contracts/results.py#L75-L88) | +| `resource_type` | `model`, `test`, `seed`, `snapshot`, etc. | +| `unique_id` | The unique identifier for this resource, which can be used to look up more contextual information in the [manifest](/reference/artifacts/manifest-json) | + +### Example + +```json +{ + "data": { + "description": "sql view model dbt_jcohen.my_model", + "index": 1, + "node_info": { + "materialized": "view", + "meta": { + "first": "some_value", + "second": "1234" + }, + "node_finished_at": "", + "node_name": "my_model", + "node_path": "my_model.sql", + "node_relation": { + "alias": "my_model", + "database": "my_database", + "relation_name": "\"my_database\".\"my_schema\".\"my_model\"", + "schema": "my_schema" + }, + "node_started_at": "2023-04-12T19:27:27.435364", + "node_status": "started", + "resource_type": "model", + "unique_id": "model.my_dbt_project.my_model" + }, + "total": 1 + }, + "info": { + "category": "", + "code": "Q011", + "extra": { + "my_custom_env_var": "my_custom_value" + }, + "invocation_id": "206b4e61-8447-4af7-8035-b174ab3ac991", + "level": "info", + "msg": "1 of 1 START sql view model my_database.my_model ................................
[RUN]", + "name": "LogStartLine", + "pid": 95894, + "thread": "Thread-1", + "ts": "2023-04-12T19:27:27.436283Z" + } +} +``` + + + + + +When `json` [log formatting](/reference/global-configs/logs) is enabled, dbt will produce rich, structured log information which can be piped into monitoring tools for analysis, or to power applications with dbt metadata in real time. Each log line will have the following properties: @@ -40,7 +131,7 @@ Each log line will have the following properties: |-------------|---------------| | `code` | A unique identifier for each event type | | `data` | A dictionary containing programmatically accessible information about the log line. The contents of this dictionary vary based on the event type which generated this log message. | -| [`invocation_id`](invocation_id) | A unique identifier for this invocation of dbt | +| [`invocation_id`](/reference/dbt-jinja-functions/invocation_id) | A unique identifier for this invocation of dbt | | `level` | A string representation of the log level (`debug`, `info`, `warn`, `error`) | | `log_version` | Integer indicating version | | `msg` | The human-friendly log message. **Note**: This message is not intended for machine consumption. Log messages are subject to change in future versions of dbt, and those changes may or may not coincide with a change in `log_version`. | @@ -62,7 +153,7 @@ If available, `node_info` will include: | `node_status` | Current status of the node, as defined in [the result contract](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/contracts/results.py#L61-L74) | | `resource_type` | model, test, seed, snapshot, etc.
| | `type` | Always `'node_status'` | -| `unique_id` | The unique identifier for this resource, which can be used to look up contextual information in a [manifest](artifacts/manifest-json) | +| `unique_id` | The unique identifier for this resource, which can be used to look up contextual information in a [manifest](/reference/artifacts/manifest-json) | ### Example @@ -98,14 +189,20 @@ If available, `node_info` will include: } ``` + + ## Python interface -`dbt-core` makes available a full history of events fired during an invocation, in the form of an `EVENT_HISTORY` object: +:::warning +The `EVENT_HISTORY` object has been deprecated and removed in dbt Core v1.4+ +::: -```python -from dbt.events.functions import EVENT_HISTORY -``` +Older versions of `dbt-core` made available a full history of events fired during an invocation, in the form of an `EVENT_HISTORY` object. + + + +When [invoking dbt programmatically](programmatic-invocations#registering-callbacks), it is possible to register a callback on dbt's `EventManager`. This allows access to structured events as Python objects, to enable custom logging and integration with other systems. -The Python interface into events is significantly less mature than the structured logging interface. For all use cases, we recommend parsing JSON-formatted logs. + -For details about how the eventing system has been implemented in dbt-core, see the [`events` module README](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/events/README.md). +The Python interface into events is significantly less mature than the structured logging interface. For all standard use cases, we recommend parsing JSON-formatted logs. 
diff --git a/website/docs/reference/exposure-properties.md b/website/docs/reference/exposure-properties.md index cee5271ac26..aaed2a20a09 100644 --- a/website/docs/reference/exposure-properties.md +++ b/website/docs/reference/exposure-properties.md @@ -1,16 +1,11 @@ --- title: Exposure properties +description: "Read this guide to understand exposure properties in dbt." --- - - -* Exposures are new in `v0.18.1` - - - ## Related documentation -- [Using exposures](exposures) -- [Declaring resource properties](configs-and-properties) +- [Using exposures](/docs/build/exposures) +- [Declaring resource properties](/reference/configs-and-properties) ## Overview Exposures are defined in `.yml` files nested under an `exposures:` key. You may define `exposures` in YAML files that also define `sources` or `models`. @@ -23,6 +18,8 @@ Exposure names must contain only letters, numbers, and underscores (no spaces or
          + + ```yml @@ -30,12 +27,47 @@ version: 2 exposures: - name: - [description](description): + [description](/reference/resource-properties/description): type: {dashboard, notebook, analysis, ml, application} url: maturity: {high, medium, low} - [tags](resource-configs/tags): [] - [meta](resource-configs/meta): {} + [tags](/reference/resource-configs/tags): [] + [meta](/reference/resource-configs/meta): {} + owner: + name: + email: + + depends_on: + - ref('model') + - ref('seed') + - source('name', 'table') + - metric('metric_name') + + label: "Human-Friendly Name for this Exposure!" + [config](/reference/resource-properties/config): + enabled: true | false + + - name: ... # declare properties of additional exposures +``` + + + + + + + + +```yml +version: 2 + +exposures: + - name: + [description](/reference/resource-properties/description): + type: {dashboard, notebook, analysis, ml, application} + url: + maturity: {high, medium, low} + [tags](/reference/resource-configs/tags): [] + [meta](/reference/resource-configs/meta): {} owner: name: email: @@ -47,21 +79,24 @@ exposures: # added in dbt Core v1.3 label: "Human-Friendly Name for this Exposure!" - [config](resource-properties/config): + [config](/reference/resource-properties/config): enabled: true | false - name: ... # declare properties of additional exposures ``` + ## Example ```yaml +version: 2 + exposures: - + - name: weekly_jaffle_metrics label: Jaffles by the Week # optional, new in dbt Core v1.3 type: dashboard # required @@ -69,15 +104,17 @@ exposures: url: https://bi.tool/dashboards/1 # optional description: > # optional Did someone say "exponential growth"? 
- + depends_on: # expected - ref('fct_orders') - ref('dim_customers') - source('gsheets', 'goals') + - metric('count_orders') owner: - name: Claire from Data # optional - email: data@jaffleshop.com # required + name: Callum McData + email: data@jaffleshop.com + - name: jaffle_recommender diff --git a/website/docs/reference/global-cli-flags.md b/website/docs/reference/global-cli-flags.md index a6bc4ee9de0..1604bc545b5 100644 --- a/website/docs/reference/global-cli-flags.md +++ b/website/docs/reference/global-cli-flags.md @@ -5,7 +5,7 @@ id: "global-cli-flags" dbt commands, such as `run` or `test`, support their own command-specific CLI flags. In addition, the dbt command itself supports "global" flags applicable to *all* subcommands. -As of v1.0, the vast majority of global CLI flags map to [global configs](global-configs), which can also be configured via environment variables or in the `profiles.yml`. +As of v1.0, the vast majority of global CLI flags map to [global configs](/reference/global-configs/about-global-configs), which can also be configured via environment variables or in the `profiles.yml`. The `--version` and `--record-timing-info` flags remain available to the CLI only. diff --git a/website/docs/reference/global-configs.md b/website/docs/reference/global-configs.md deleted file mode 100644 index 4afc5cf09b1..00000000000 --- a/website/docs/reference/global-configs.md +++ /dev/null @@ -1,421 +0,0 @@ ---- -title: "Global Configs" -id: "global-configs" ---- - -## About Global Configs - -Global configs enable you to fine-tune _how_ dbt runs projects on your machine—whether your personal laptop, an orchestration tool running remotely, or (in some cases) dbt Cloud. In general, they differ from most [project configs](reference/dbt_project.yml) and [resource configs](reference/configs-and-properties), which tell dbt _what_ to run. 
- -Global configs control things like the visual output of logs, the manner in which dbt parses your project, and what to do when dbt finds a version mismatch or a failing model. These configs are "global" because they are available for all dbt commands, and because they can be set for all projects running on the same machine or in the same environment. - -Starting in v1.0, you can set global configs in three places. When all three are set, command line flags take precedence, then environment variables, and last yaml configs (usually `profiles.yml`). - -## Command line flags - -Command line (CLI) flags immediately follow `dbt` and precede your subcommand. When set, CLI flags override environment variables and profile configs. - -Use this non-boolean config structure, replacing `` with the config you are enabling or disabling, `` with the new setting for the config, and `` with the command this config applies to: - - - - -```text - -$ --= - -``` - - - -Non-boolean config examples: - - - - -```text - -$ dbt --printer-width=80 run -$ dbt --indirect-selection=eager test - -``` - - - -To turn on boolean configs, you would use the `--` CLI flag, and a `--no-` CLI flag to turn off boolean configs, replacing `` with the config you are enabling or disabling and `` with the command this config applies to. - -Boolean config structure: - - - - -```text -$ dbt -- -$ dbt --no- - -``` - - - -Boolean config example: - - - - -```text - -$ dbt --version-check run -$ dbt --no-version-check run - -``` - - - -## Environment variables - -Environment variables contain a `DBT_` prefix - - - -```text - -$ export DBT_=True -$ dbt run - -``` - - - -## Yaml configurations - -For most global configurations, you can set "user profile" configurations in the `config:` block of `profiles.yml`. This style of configuration sets default values for all projects using this profile directory—usually, all projects running on your local machine. 
- - - -```yaml - -config: - : true - -``` - - - - - -The exception: Some global configurations are actually set in `dbt_project.yml`, instead of `profiles.yml`, because they control where dbt places logs and artifacts. Those file paths are always relative to the location of `dbt_project.yml`. For more details, see ["Log and target paths"](#log-and-target-paths) below. - - - - - -### Cache database objects for selected resource - -:::caution Experimental config flag -This should be considered an _unstable_ config flag because it is experimental and subject to change. We reserve the right to make breaking changes to this config flag. -::: - -At the start of runs, dbt caches metadata about all the objects in all the schemas where it might materialize resources (such as models). By default, dbt caches all schemas related to the project. When this config is enabled, dbt will only cache schemas related to selected resources for the current run. This can offer significant speed improvements when running a small subset of a large project. - -For example, to improve speed and performance while focused on developing Salesforce models, which are materialized into their own dedicated schema, you would select those models and pass the `cache-selected-only` flag: - -```text - -$ dbt --cache-selected-only run --select salesforce - -``` - -The previous example enables you to start working with your salesforce models, and dbt will only cache those models instead of the entire project. - -To set the config in your `profile.yml`: - - -```yaml - -config: - cache_selected_only: true - -``` - - - - - -### Checking version compatibility - -Projects are recommended to set [dbt version requirements](require-dbt-version), especially if they use features that are newer, or which may break in future versions of dbt Core. By default, if you run a project with an incompatible dbt version, dbt will raise an error. 
- -You can use the `VERSION_CHECK` config to disable this check and suppress the error message: - -``` -$ dbt --no-version-check run -Running with dbt=1.0.0 -Found 13 models, 2 tests, 1 archives, 0 analyses, 204 macros, 2 operations.... -``` - -### Debug-level logging - -The `DEBUG` config redirects dbt's debug logs to standard out. The has the effect of showing debug-level log information in the terminal in addition to the `logs/dbt.log` file. This output is verbose. - -The `--debug` flag is also available via shorthand as `-d`. - - - -```text -$ dbt --debug run -... - -``` - - - -### Experimental parser - -With the `USE_EXPERIMENTAL_PARSER` config, you can opt into the latest and greatest experimental version of the static parser, which is still being sampled for 100% correctness. See [the docs on parsing](parsing#experimental-parser) for more details. - - - -```yaml - -config: - use_experimental_parser: true - -``` - - - -### Failing fast - -Supply the `-x` or `--fail-fast` flag to `dbt run` to make dbt exit immediately if a single resource fails to build. If other models are in-progress when the first model fails, then dbt will terminate the connections for these still-running models. - -For example, you can select four models to run, but if a failure occurs in the first model, the failure will prevent other models from running: - -```text -$ dbt -x run --threads 1 -Running with dbt=1.0.0 -Found 4 models, 1 test, 1 snapshot, 2 analyses, 143 macros, 0 operations, 1 seed file, 0 sources - -14:47:39 | Concurrency: 1 threads (target='dev') -14:47:39 | -14:47:39 | 1 of 4 START table model test_schema.model_1........... [RUN] -14:47:40 | 1 of 4 ERROR creating table model test_schema.model_1.. [ERROR in 0.06s] -14:47:40 | 2 of 4 START view model test_schema.model_2............ [RUN] -14:47:40 | CANCEL query model.debug.model_2....................... [CANCEL] -14:47:40 | 2 of 4 ERROR creating view model test_schema.model_2... 
[ERROR in 0.05s] - -Database Error in model model_1 (models/model_1.sql) - division by zero - compiled SQL at target/run/debug/models/model_1.sql - -Encountered an error: -FailFast Error in model model_1 (models/model_1.sql) - Failing early due to test failure or runtime error -``` - -### Log Formatting - -The `LOG_FORMAT` config specifies how dbt's logs should be formatted. If the value of this config is `json`, dbt will output fully structured logs in format; otherwise, it will output text-formatted logs that are sparser for the CLI and more detailed in `logs/dbt.log`. - - - -```text -$ dbt --log-format json run -{"code": "A001", "data": {"v": "=1.0.0"}, "invocation_id": "1193e449-4b7a-4eb1-8e8e-047a8b3b7973", "level": "info", "log_version": 1, "msg": "Running with dbt=1.0.0", "node_info": {}, "pid": 35098, "thread_name": "MainThread", "ts": "2021-12-03T10:46:59.928217Z", "type": "log_line"} -``` - -:::tip Tip: verbose structured logs - -Use `json` formatting value in conjunction with the `DEBUG` config to produce rich log information which can be piped into monitoring tools for analysis: - -```text -$ dbt --debug --log-format json run -``` - -See [structured logging](events-logging#structured-logging) for more details. - -::: - - - -### Partial Parsing - -The `PARTIAL_PARSE` config can turn partial parsing on or off in your project. See [the docs on parsing](parsing#partial-parsing) for more details. - - - -```yaml - -config: - partial_parse: true - -``` - - - - - -```text -dbt --no-partial-parse run -``` - - - -### Printer width - -By default, dbt will print out lines padded to 80 characters wide. You can change this setting by adding the following to your `profiles.yml` file: - - - -```yaml -config: - printer_width: 120 -``` - - - - - -### Log and target paths - -By default, dbt will write logs to a directory named `logs/`, and all other artifacts to a directory named `target/`. 
Both of those directories are located relative to `dbt_project.yml` of the active project—that is, the root directory from which dbt is run. - -Just like other global configs, it is possible to override these values for your environment or invocation by using CLI flags (`--target-path`, `--log-path`) or environment variables (`DBT_TARGET_PATH`, `DBT_LOG_PATH`). - -Unlike the other global configs documented on this page, which can be set in `profiles.yml`, the project paths are configured in `dbt_project.yml`. This is because `profiles.yml` and `dbt_project.yml` are most often located in separate file systems on your machine, and the log and artifact paths are always defined relative to the location of `dbt_project.yml`. - - - -```yaml -[target-path](target-path): "other-target" -[log-path](log-path): "other-logs" -``` - - - - - -### Send anonymous usage stats - -We want to build the best version of dbt possible, and a crucial part of that is understanding how users work with dbt. To this end, we've added some simple event tracking to dbt (using Snowplow). We do not track credentials, raw model contents or model names (we consider these private, and frankly none of our business). - -Usage statistics are fired when dbt is invoked and when models are run. These events contain basic platform information (OS + python version) and metadata such as whether the invocation succeeded, how long it took, an anonymized hash key representing the raw model content, and number of nodes that were run. You can see all the event definitions in [`tracking.py`](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/tracking.py). - -By default this is turned on – you can opt out of event tracking at any time by adding the following to your `profiles.yml` file: - -```yaml -config: - send_anonymous_usage_stats: False -``` - -You can also use the DO_NOT_TRACK environmental variable to enable or disable sending anonymous data. 
For more information, see [Environmental variables](/docs/build/environment-variables). - -`DO_NOT_TRACK=1` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=False` -`DO_NOT_TRACK=0` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=True` - -### Static parser - -The `STATIC_PARSER` config can enable or disable use of the static parser. See [the docs on parsing](parsing#static-parser) for more details. - - - -```yaml - -config: - static_parser: true - -``` - - - -### Strict - -As of v1.0, the `-S` or `--strict` flag has been deprecated. - - - -### Suppress non-error logs in output - -By default, dbt shows all logs in standard out (stdout). You can use the `QUIET` config to show only error logs in stdout. Logs will still include the output of anything passed to the `print()` macro. For example, you might suppress all but error logs to more easily find and debug a jinja error. - - - -```yaml -config: - quiet: true -``` - - - -Supply the `-q` or `--quiet` flag to `dbt run` to show only error logs and suppress non-error logs. - -```text -$ dbt --quiet run -... - -``` - -### Suppress `print()` messages in stdout - -By default, dbt includes `print()` messages in standard out (stdout). You can use the `NO_PRINT` config to prevent these messages from showing up in stdout. - - - -```yaml -config: - no_print: true -``` - - - -Supply `--no-print` flag to `dbt run` to suppress `print()` messages from showing in stdout. - -```text -$ dbt --no-print run -... - -``` - - - -### Use colors - -By default, dbt will colorize the output it prints in your terminal. You can turn this off by adding the following to your `profiles.yml` file: - - - -```yaml -config: - use_colors: False -``` - -```text -$ dbt --use-colors run -$ dbt --no-use-colors run -``` - - - -### Warnings as Errors - -Turning on the `WARN_ERROR` config will convert dbt warnings into errors. Any time dbt would normally warn, it will instead raise an error. 
Examples include `--select` criteria that selects no resources, deprecations, configurations with no associated models, invalid test configurations, or tests and freshness checks that are configured to return warnings. - - - -```text -$ dbt --warn-error run -... -``` - - - -### Writing JSON artifacts - -The `WRITE_JSON` config determines whether dbt writes JSON artifacts (eg. `manifest.json`, `run_results.json`) to the `target/` directory. JSON serialization can be slow, and turning this flag off _might_ make invocations of dbt faster. Alternatively, you might disable this config if you want to perform a dbt operation and avoid overwriting artifacts from a previous run step. - - - -```text -dbt --no-write-json run -``` - - diff --git a/website/docs/reference/global-configs/about-global-configs.md b/website/docs/reference/global-configs/about-global-configs.md new file mode 100644 index 00000000000..42819cdac8f --- /dev/null +++ b/website/docs/reference/global-configs/about-global-configs.md @@ -0,0 +1,11 @@ +--- +title: "About global configs" +id: "about-global-configs" +sidebar: "About global configs" +--- + +Global configs enable you to fine-tune _how_ dbt runs projects on your machine—whether your personal laptop, an orchestration tool running remotely, or (in some cases) dbt Cloud. In general, they differ from most [project configs](/reference/dbt_project.yml) and [resource configs](/reference/configs-and-properties), which tell dbt _what_ to run. + +Global configs control things like the visual output of logs, the manner in which dbt parses your project, and what to do when dbt finds a version mismatch or a failing model. These configs are "global" because they are available for all dbt commands, and because they can be set for all projects running on the same machine or in the same environment. + +Starting in v1.0, you can set global configs in three places. 
When all three are set, command line flags take precedence, then environment variables, and last yaml configs (usually `profiles.yml`). \ No newline at end of file diff --git a/website/docs/reference/global-configs/cache.md b/website/docs/reference/global-configs/cache.md new file mode 100644 index 00000000000..a605e1e70f3 --- /dev/null +++ b/website/docs/reference/global-configs/cache.md @@ -0,0 +1,66 @@ +--- +title: "Cache" +id: "cache" +sidebar: "Cache" +--- + + + +### Cache population + +At the start of runs, dbt caches metadata about all the objects in all the schemas where it might materialize resources (such as models). By default, dbt populates the cache with information on all schemas related to the project. + +There are two ways to optionally modify this behavior: +- `POPULATE_CACHE` (default: `True`): Whether to populate the cache at all. To skip cache population entirely, use the `--no-populate-cache` flag or `DBT_POPULATE_CACHE: False`. Note that this does not _disable_ the cache; missed cache lookups will run queries, and update the cache afterward. +- `CACHE_SELECTED_ONLY` (default `False`): Whether to limit cache population to just the resources selected in the current run. This can offer significant speed improvements when running a small subset of a large project, while still providing the benefit of caching upfront. 
+ +For example, to quickly compile a model that requires no database metadata or introspective queries: +```text + +dbt --no-populate-cache compile --select my_model_name + +``` + +Or, to improve speed and performance while focused on developing Salesforce models, which are materialized into their own dedicated schema, you could select those models and pass the `cache-selected-only` flag: + +```text + +dbt --cache-selected-only run --select salesforce + +``` + + + + + +### Cache database objects for selected resource + +:::caution Experimental config flag +This should be considered an _unstable_ config flag because it is experimental and subject to change. We reserve the right to make breaking changes to this config flag. +::: + +At the start of runs, dbt caches metadata about all the objects in all the schemas where it might materialize resources (such as models). By default, dbt caches all schemas related to the project. When this config is enabled, dbt will only cache schemas related to selected resources for the current run. This can offer significant speed improvements when running a small subset of a large project. + +For example, to improve speed and performance while focused on developing Salesforce models, which are materialized into their own dedicated schema, you would select those models and pass the `cache-selected-only` flag: + +```text + +dbt --cache-selected-only run --select salesforce + +``` + +The previous example enables you to start working with your salesforce models, and dbt will only cache those models instead of the entire project. 
+ +To set the config in your `profiles.yml`: + + +```yaml + +config: + cache_selected_only: true + +``` + + + + diff --git a/website/docs/reference/global-configs/command-line-flags.md b/website/docs/reference/global-configs/command-line-flags.md new file mode 100644 index 00000000000..fbe89ce28f1 --- /dev/null +++ b/website/docs/reference/global-configs/command-line-flags.md @@ -0,0 +1,98 @@ +--- +title: "Command line flags" +id: "command-line-flags" +sidebar: "Command line flags" +--- + +For consistency, command-line interface (CLI) flags should come right after the `dbt` prefix and its subcommands. This includes "global" flags (supported for all commands). When set, CLI flags override environment variables and profile configs. + +For example, instead of using: + +```bash +dbt --no-populate-cache run +``` + +You should use: + +```bash +dbt run --no-populate-cache +``` + +Passing flags (such as "global flags") _before_ the subcommand is legacy functionality that dbt Labs can remove at any time. We do not support using the same flag before and after the subcommand. + +## Using boolean and non-boolean flags + +You can construct your commands with boolean flags to enable or disable or with non-boolean flags that use specific values, such as strings. + + + + + +Use this non-boolean config structure: +- Replacing `` with the command this config applies to. +- `` with the config you are enabling or disabling, and +- `` with the new setting for the config. + + + + +```text + + --= + +``` + + + +### Example + + + + +```text + +dbt run --printer-width=80 +dbt test --indirect-selection=eager + +``` + + + + + + + +To enable or disable boolean configs: +- Use `` this config applies to. +- Followed by `--` to turn it on, or `--no-` to turn it off.
+- Replace `` with the config you are enabling or disabling + + + + + +```text +dbt -- +dbt --no- + +``` + + + +### Example + + + + +```text + +dbt run --version-check +dbt run --no-version-check + +``` + + + + + + diff --git a/website/docs/reference/global-configs/environment-variable-configs.md b/website/docs/reference/global-configs/environment-variable-configs.md new file mode 100644 index 00000000000..e7596835409 --- /dev/null +++ b/website/docs/reference/global-configs/environment-variable-configs.md @@ -0,0 +1,20 @@ +--- +title: "Environment variable configs" +id: "environment-variable-configs" +sidebar: "Environment variable configs" +--- + +Environment variables contain a `DBT_` prefix + + + +```text + +$ export DBT_=True +dbt run + +``` + + + +For more detailed information, read our [environment variables page](/docs/build/environment-variables) \ No newline at end of file diff --git a/website/docs/reference/global-configs/failing-fast.md b/website/docs/reference/global-configs/failing-fast.md new file mode 100644 index 00000000000..2b982c9665a --- /dev/null +++ b/website/docs/reference/global-configs/failing-fast.md @@ -0,0 +1,31 @@ +--- +title: "Failing fast" +id: "failing-fast" +sidebar: "Failing fast" +--- + +Supply the `-x` or `--fail-fast` flag to `dbt run` to make dbt exit immediately if a single resource fails to build. If other models are in-progress when the first model fails, then dbt will terminate the connections for these still-running models. + +For example, you can select four models to run, but if a failure occurs in the first model, the failure will prevent other models from running: + +```text +dbt -x run --threads 1 +Running with dbt=1.0.0 +Found 4 models, 1 test, 1 snapshot, 2 analyses, 143 macros, 0 operations, 1 seed file, 0 sources + +14:47:39 | Concurrency: 1 threads (target='dev') +14:47:39 | +14:47:39 | 1 of 4 START table model test_schema.model_1........... 
[RUN] +14:47:40 | 1 of 4 ERROR creating table model test_schema.model_1.. [ERROR in 0.06s] +14:47:40 | 2 of 4 START view model test_schema.model_2............ [RUN] +14:47:40 | CANCEL query model.debug.model_2....................... [CANCEL] +14:47:40 | 2 of 4 ERROR creating view model test_schema.model_2... [ERROR in 0.05s] + +Database Error in model model_1 (models/model_1.sql) + division by zero + compiled SQL at target/run/debug/models/model_1.sql + +Encountered an error: +FailFast Error in model model_1 (models/model_1.sql) + Failing early due to test failure or runtime error +``` diff --git a/website/docs/reference/global-configs/json-artifacts.md b/website/docs/reference/global-configs/json-artifacts.md new file mode 100644 index 00000000000..d8948b20d8e --- /dev/null +++ b/website/docs/reference/global-configs/json-artifacts.md @@ -0,0 +1,15 @@ +--- +title: "JSON artifacts" +id: "json-artifacts" +sidebar: "JSON artifacts" +--- + +The `WRITE_JSON` config determines whether dbt writes JSON artifacts (e.g., `manifest.json`, `run_results.json`) to the `target/` directory. JSON serialization can be slow, and turning this flag off _might_ make invocations of dbt faster. Alternatively, you might disable this config if you want to perform a dbt operation and avoid overwriting artifacts from a previous run step. + + + +```text +dbt --no-write-json run +``` + + diff --git a/website/docs/reference/global-configs/logs.md b/website/docs/reference/global-configs/logs.md new file mode 100644 index 00000000000..8c819193fc6 --- /dev/null +++ b/website/docs/reference/global-configs/logs.md @@ -0,0 +1,144 @@ +--- +title: "Logs" +id: "logs" +sidebar: "logs" +--- + +### Log Formatting + +The `LOG_FORMAT` config specifies how dbt's logs should be formatted. If the value of this config is `json`, dbt will output fully structured logs in JSON format; otherwise, it will output text-formatted logs that are sparser for the CLI and more detailed in `logs/dbt.log`.
+ + +```text +dbt --log-format json run +{"code": "A001", "data": {"v": "=1.0.0"}, "invocation_id": "1193e449-4b7a-4eb1-8e8e-047a8b3b7973", "level": "info", "log_version": 1, "msg": "Running with dbt=1.0.0", "node_info": {}, "pid": 35098, "thread_name": "MainThread", "ts": "2021-12-03T10:46:59.928217Z", "type": "log_line"} +``` + + + + + +To set the log format for the file (the `LOG_FORMAT_FILE` config) without impacting the console log format, use the `--log-format-file` flag. + + +```text +dbt --log-format-file json run +``` + + + +:::tip Tip: verbose structured logs + +Use the `json` formatting value in conjunction with the `DEBUG` config to produce rich log information which can be piped into monitoring tools for analysis: + +```text +dbt --debug --log-format json run +``` + +See [structured logging](/reference/events-logging#structured-logging) for more details. + +::: + + + +### Log Level + +The `LOG_LEVEL` config sets the minimum severity of events captured in the console and file logs. This is a more flexible alternative to the `--debug` flag. The available options for the log levels are `debug`, `info`, `warn`, `error`, or `none`. + +Setting the `--log-level` flag configures both the console and file logs. + + +```text +dbt --log-level debug run +``` + +To set the file log level to a different value than the console, use the `--log-level-file` flag. + + +```text +dbt --log-level-file error run +``` + + + + +### Debug-level logging + +The `DEBUG` config redirects dbt's debug logs to standard output. This has the effect of showing debug-level log information in the terminal in addition to the `logs/dbt.log` file. This output is verbose. + +The `--debug` flag is also available via shorthand as `-d`. + + + +```text +dbt --debug run +... + +``` + + + + + +### Log and target paths + +By default, dbt will write logs to a directory named `logs/`, and all other artifacts to a directory named `target/`.
Both of those directories are located relative to `dbt_project.yml` of the active project—that is, the root directory from which dbt is run. + +Just like other global configs, it is possible to override these values for your environment or invocation by using CLI flags (`--target-path`, `--log-path`) or environment variables (`DBT_TARGET_PATH`, `DBT_LOG_PATH`). + +Unlike the other global configs documented on this page, which can be set in `profiles.yml`, the project paths are configured in `dbt_project.yml`. This is because `profiles.yml` and `dbt_project.yml` are most often located in separate file systems on your machine, and the log and artifact paths are always defined relative to the location of `dbt_project.yml`. + + + +```yaml +[target-path](target-path): "other-target" +[log-path](log-path): "other-logs" +``` + + + + + +### Suppress non-error logs in output + +By default, dbt shows all logs in standard out (stdout). You can use the `QUIET` config to show only error logs in stdout. Logs will still include the output of anything passed to the `print()` macro. For example, you might suppress all but error logs to more easily find and debug a jinja error. + + + +```yaml +config: + quiet: true +``` + + + +Supply the `-q` or `--quiet` flag to `dbt run` to show only error logs and suppress non-error logs. + +```text +dbt --quiet run +... + +``` + + + +### Color + +You can set the color preferences for the file logs only within `profiles.yml` or using the `--use-colors-file / --no-use-colors-file` flags. 
+ + + +```yaml +config: + use_colors_file: False +``` + + + +```text +dbt --use-colors-file run +dbt --no-use-colors-file run +``` + + diff --git a/website/docs/reference/global-configs/parsing.md b/website/docs/reference/global-configs/parsing.md new file mode 100644 index 00000000000..b8fbf432652 --- /dev/null +++ b/website/docs/reference/global-configs/parsing.md @@ -0,0 +1,58 @@ +--- +title: "Parsing" +id: "parsing" +sidebar: "Parsing" +--- + +### Partial Parsing + +The `PARTIAL_PARSE` config can turn partial parsing on or off in your project. See [the docs on parsing](/reference/parsing#partial-parsing) for more details. + + + +```yaml + +config: + partial_parse: true + +``` + + + + + +```text +dbt --no-partial-parse run +``` + + + +### Static parser + +The `STATIC_PARSER` config can enable or disable the use of the static parser. See [the docs on parsing](/reference/parsing#static-parser) for more details. + + + +```yaml + +config: + static_parser: true + +``` + + + +### Experimental parser + +With the `USE_EXPERIMENTAL_PARSER` config, you can opt into the latest and greatest experimental version of the static parser, which is still being sampled for 100% correctness. See [the docs on parsing](/reference/parsing#experimental-parser) for more details. + + + +```yaml + +config: + use_experimental_parser: true + +``` + + diff --git a/website/docs/reference/global-configs/print-output.md b/website/docs/reference/global-configs/print-output.md new file mode 100644 index 00000000000..112b92b546f --- /dev/null +++ b/website/docs/reference/global-configs/print-output.md @@ -0,0 +1,101 @@ +--- +title: "Print output" +id: "print-output" +sidebar: "Print output" +--- + +### Suppress `print()` messages in stdout + + + +By default, dbt includes `print()` messages in standard out (stdout). You can use the `NO_PRINT` config to prevent these messages from showing up in stdout. 
+ + +```yaml +config: + no_print: true +``` + + + + + + + +By default, dbt includes `print()` messages in standard out (stdout). You can use the `PRINT` config to prevent these messages from showing up in stdout. + + + +```yaml +config: + print: false +``` + + + +:::warning Syntax deprecation + +The original `NO_PRINT` syntax has been deprecated, starting with dbt v1.5. Backward compatibility is supported but will be removed in an as-of-yet-undetermined future release. + +::: + + + +Supply the `--no-print` flag to `dbt run` to suppress `print()` messages from showing in stdout. + +```text +dbt --no-print run +... + +``` + +### Printer width + +By default, dbt will print out lines padded to 80 characters wide. You can change this setting by adding the following to your `profiles.yml` file: + + + +```yaml +config: + printer_width: 120 +``` + + + +### Print color + +By default, dbt will colorize the output it prints in your terminal. You can turn this off by adding the following to your `profiles.yml` file: + + + +```yaml +config: + use_colors: False +``` + + + +```text +dbt --use-colors run +dbt --no-use-colors run +``` + + +You can set the color preferences for the file logs alone, either within `profiles.yml` or by using the `--use-colors-file / --no-use-colors-file` flags. + + + +```yaml +config: + use_colors_file: False +``` + + + +```text +dbt --use-colors-file run +dbt --no-use-colors-file run +``` + + diff --git a/website/docs/reference/global-configs/usage-stats.md b/website/docs/reference/global-configs/usage-stats.md new file mode 100644 index 00000000000..1f9492f4a43 --- /dev/null +++ b/website/docs/reference/global-configs/usage-stats.md @@ -0,0 +1,21 @@ +--- +title: "Anonymous usage stats" +id: "usage-stats" +sidebar: "Anonymous usage stats" +--- + +We want to build the best version of dbt possible, and a crucial part of that is understanding how users work with dbt. To this end, we've added some simple event tracking to dbt (using Snowplow).
We do not track credentials, raw model contents or model names (we consider these private, and frankly none of our business). Some possible use cases for usage stats might be industry identification, use-case research, sales/marketing, product, services and/or feature improvement purposes. + +Usage statistics are fired when dbt is invoked and when models are run. These events contain basic platform information (OS + python version) and metadata such as whether the invocation succeeded, how long it took, an anonymized hash key representing the raw model content, and number of nodes that were run. You can see all the event definitions in [`tracking.py`](https://github.com/dbt-labs/dbt-core/blob/HEAD/core/dbt/tracking.py). + +By default, this is enabled. dbt Core users can opt out of event tracking at any time by adding the following to your `profiles.yml` file: + +```yaml +config: + send_anonymous_usage_stats: False +``` + +dbt Core users can also use the DO_NOT_TRACK environment variable to enable or disable sending anonymous data. For more information, see [Environment variables](/docs/build/environment-variables). + +`DO_NOT_TRACK=1` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=False` +`DO_NOT_TRACK=0` is the same as `DBT_SEND_ANONYMOUS_USAGE_STATS=True` diff --git a/website/docs/reference/global-configs/version-compatibility.md b/website/docs/reference/global-configs/version-compatibility.md new file mode 100644 index 00000000000..c7a1227adaf --- /dev/null +++ b/website/docs/reference/global-configs/version-compatibility.md @@ -0,0 +1,15 @@ +--- +title: "Checking version compatibility" +id: "version-compatibility" +sidebar: "Version compatibility" +--- + +Projects are recommended to set [dbt version requirements](/reference/project-configs/require-dbt-version), especially if they use features that are newer, or which may break in future versions of dbt Core. By default, if you run a project with an incompatible dbt version, dbt will raise an error. 
+ +You can use the `VERSION_CHECK` config to disable this check and suppress the error message: + +``` +dbt --no-version-check run +Running with dbt=1.0.0 +Found 13 models, 2 tests, 1 archives, 0 analyses, 204 macros, 2 operations.... +``` \ No newline at end of file diff --git a/website/docs/reference/global-configs/warnings.md b/website/docs/reference/global-configs/warnings.md new file mode 100644 index 00000000000..967f2209d44 --- /dev/null +++ b/website/docs/reference/global-configs/warnings.md @@ -0,0 +1,63 @@ +--- +title: "Warnings" +id: "warnings" +sidebar: "Warnings" +--- + +Turning on the `WARN_ERROR` config will convert dbt warnings into errors. Any time dbt would normally warn, it will instead raise an error. Examples include `--select` criteria that select no resources, deprecations, configurations with no associated models, invalid test configurations, or tests and freshness checks that are configured to return warnings. + + + +```text +dbt --warn-error run +... +``` + + + + +Converting any and all warnings to errors may suit your needs perfectly, but there may be some warnings you just don't care about, and some you care about a lot. + +The `WARN_ERROR_OPTIONS` config gives you more granular control over _exactly which types of warnings_ are treated as errors. Warnings that should be treated as errors can be specified through `include` and/or `exclude` parameters. Warning names can be found in [dbt-core's types.py file](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/events/types.py), where each class name that inherits from `WarnLevel` corresponds to a warning name (e.g. `AdapterDeprecationWarning`, `NoNodesForSelectionCriteria`). + +The `include` parameter can be set to `"all"` or `"*"` to treat all warnings as exceptions, or to a list of specific warning names to treat as exceptions. When `include` is set to `"all"` or `"*"`, the optional `exclude` parameter can be set to exclude specific warnings from being treated as exceptions.
+ +:::info `WARN_ERROR` and `WARN_ERROR_OPTIONS` are mutually exclusive +`WARN_ERROR` and `WARN_ERROR_OPTIONS` are mutually exclusive. You can only specify one, even when you're specifying the config in multiple places (e.g. env var + CLI flag); otherwise, you'll see a usage error. +::: + +```text +dbt --warn-error-options '{"include": "all"}' run +... +``` + +```text +dbt --warn-error-options '{"include": "all", "exclude": ["NoNodesForSelectionCriteria"]}' run +... +``` + + +```text +dbt --warn-error-options '{"include": ["NoNodesForSelectionCriteria"]}' run +... +``` + +```text +DBT_WARN_ERROR_OPTIONS='{"include": ["NoNodesForSelectionCriteria"]}' dbt run +... +``` + + + +```yaml + +config: + warn_error_options: + include: all + exclude: + - NoNodesForSelectionCriteria + +``` + + + diff --git a/website/docs/reference/global-configs/yaml-configurations.md b/website/docs/reference/global-configs/yaml-configurations.md new file mode 100644 index 00000000000..73b8f558a9f --- /dev/null +++ b/website/docs/reference/global-configs/yaml-configurations.md @@ -0,0 +1,24 @@ +--- +title: "YAML configurations" +id: "yaml-configurations" +sidebar: "YAML configurations" +--- + +For most global configurations, you can set "user profile" configurations in the `config:` block of `profiles.yml`. This style of configuration sets default values for all projects using this profile directory—usually, all projects running on your local machine. + + + +```yaml + +config: + <THIS-CONFIG>: true + +``` + + + + + +The exception: Some global configurations are actually set in `dbt_project.yml`, instead of `profiles.yml`, because they control where dbt places logs and artifacts. Those file paths are always relative to the location of `dbt_project.yml`. For more details, see ["Log and target paths"](/reference/global-configs/logs#log-and-target-paths).
+ + \ No newline at end of file diff --git a/website/docs/reference/macro-properties.md b/website/docs/reference/macro-properties.md index 0f3e8833bde..9919835f3c5 100644 --- a/website/docs/reference/macro-properties.md +++ b/website/docs/reference/macro-properties.md @@ -13,13 +13,13 @@ version: 2 macros: - name: - [description](description): + [description](/reference/resource-properties/description): [docs](/reference/resource-configs/docs): show: true | false arguments: - name: - [type](argument-type): - [description](description): + [type](/reference/resource-properties/argument-type): + [description](/reference/resource-properties/description): - ... # declare properties of additional arguments - name: ... # declare properties of additional macros @@ -27,9 +27,3 @@ macros: ``` - - - -* `v0.16.0`: The ability to declare macro properties was introduced. - - diff --git a/website/docs/reference/model-configs.md b/website/docs/reference/model-configs.md index 87027448cfe..06830d0d32b 100644 --- a/website/docs/reference/model-configs.md +++ b/website/docs/reference/model-configs.md @@ -1,14 +1,11 @@ --- title: Model configurations +description: "Read this guide to understand model configurations in dbt." 
--- - - - **v0.21.0** introduced the `config` property, thereby allowing you to configure models in all `.yml` files - - ## Related documentation * [Models](/docs/build/models) -* [`run` command](run) +* [`run` command](/reference/commands/run) ## Available configurations ### Model-specific configurations @@ -28,9 +25,9 @@ title: Model configurations ```yaml models: - [](resource-path): - [+](plus-prefix)[materialized](materialized): - [+](plus-prefix)[sql_header](sql_header): + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[materialized](/reference/resource-configs/materialized): + [+](/reference/resource-configs/plus-prefix)[sql_header](/reference/resource-configs/sql_header): ``` @@ -49,8 +46,8 @@ version: 2 models: - name: [] config: - [materialized](materialized): - [sql_header](sql_header): + [materialized](/reference/resource-configs/materialized): + [sql_header](/reference/resource-configs/sql_header): ``` @@ -66,8 +63,8 @@ models: ```jinja {{ config( - [materialized](materialized)="", - [sql_header](sql_header)="" + [materialized](/reference/resource-configs/materialized)="", + [sql_header](/reference/resource-configs/sql_header)="" ) }} ``` @@ -97,18 +94,19 @@ models: ```yaml models: - [](resource-path): - [+](plus-prefix)[enabled](enabled): true | false - [+](plus-prefix)[tags](resource-configs/tags): | [] - [+](plus-prefix)[pre-hook](pre-hook-post-hook): | [] - [+](plus-prefix)[post-hook](pre-hook-post-hook): | [] - [+](plus-prefix)[database](resource-configs/database): - [+](plus-prefix)[schema](resource-configs/schema): - [+](plus-prefix)[alias](resource-configs/alias): - [+](plus-prefix)[persist_docs](persist_docs): - [+](plus-prefix)[full_refresh](full_refresh): - [+](plus-prefix)[meta](meta): {} - [+](plus-prefix)[grants](grants): {} + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[enabled](/reference/resource-configs/enabled): true | false + 
[+](/reference/resource-configs/plus-prefix)[tags](/reference/resource-configs/tags): | [] + [+](/reference/resource-configs/plus-prefix)[pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [+](/reference/resource-configs/plus-prefix)[post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [+](/reference/resource-configs/plus-prefix)[database](/reference/resource-configs/database): + [+](/reference/resource-configs/plus-prefix)[schema](/reference/resource-properties/schema): + [+](/reference/resource-configs/plus-prefix)[alias](/reference/resource-configs/alias): + [+](/reference/resource-configs/plus-prefix)[persist_docs](/reference/resource-configs/persist_docs): + [+](/reference/resource-configs/plus-prefix)[full_refresh](/reference/resource-configs/full_refresh): + [+](/reference/resource-configs/plus-prefix)[meta](/reference/resource-configs/meta): {} + [+](/reference/resource-configs/plus-prefix)[grants](/reference/resource-configs/grants): {} + [+](/reference/resource-configs/plus-prefix)[contract](/reference/resource-configs/contract): {} ``` @@ -127,17 +125,18 @@ version: 2 models: - name: [] config: - [enabled](enabled): true | false - [tags](resource-configs/tags): | [] - [pre-hook](pre-hook-post-hook): | [] - [post-hook](pre-hook-post-hook): | [] - [database](resource-configs/database): - [schema](resource-configs/schema): - [alias](resource-configs/alias): - [persist_docs](persist_docs): - [full_refresh](full_refresh): - [meta](meta): {} - [grants](grants): {} + [enabled](/reference/resource-configs/enabled): true | false + [tags](/reference/resource-configs/tags): | [] + [pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [database](/reference/resource-configs/database): + [schema](/reference/resource-properties/schema): + [alias](/reference/resource-configs/alias): + [persist_docs](/reference/resource-configs/persist_docs): + 
[full_refresh](/reference/resource-configs/full_refresh): + [meta](/reference/resource-configs/meta): {} + [grants](/reference/resource-configs/grants): {} + [contract](/reference/resource-configs/contract): {} ```
          @@ -153,16 +152,17 @@ models: ```jinja {{ config( - [enabled](enabled)=true | false, - [tags](resource-configs/tags)="" | [""], - [pre_hook](pre-hook-post-hook)="" | [""], - [post_hook](pre-hook-post-hook)="" | [""], - [database](resource-configs/database)="", - [schema](resource-configs/schema)="", - [alias](resource-configs/alias)="", - [persist_docs](persist_docs)={}, - [meta](meta)={} - [grants](grants)={} + [enabled](/reference/resource-configs/enabled)=true | false, + [tags](/reference/resource-configs/tags)="" | [""], + [pre_hook](/reference/resource-configs/pre-hook-post-hook)="" | [""], + [post_hook](/reference/resource-configs/pre-hook-post-hook)="" | [""], + [database](/reference/resource-configs/database)="", + [schema](/reference/resource-properties/schema)="", + [alias](/reference/resource-configs/alias)="", + [persist_docs](/reference/resource-configs/persist_docs)={}, + [meta](/reference/resource-configs/meta)={}, + [grants](/reference/resource-configs/grants)={}, + [contract](/reference/resource-configs/contract)={} ) }} ``` @@ -174,16 +174,17 @@ models: ### Warehouse-specific configurations -* [BigQuery configurations](bigquery-configs) -* [Redshift configurations](redshift-configs) -* [Snowflake configurations](snowflake-configs) -* [Spark configurations](spark-configs) +* [BigQuery configurations](/reference/resource-configs/bigquery-configs) +* [Redshift configurations](/reference/resource-configs/redshift-configs) +* [Snowflake configurations](/reference/resource-configs/snowflake-configs) +* [Databricks configurations](/reference/resource-configs/databricks-configs) +* [Spark configurations](/reference/resource-configs/spark-configs) ## Configuring models Models can be configured in one of three ways: 1. Using a `config()` Jinja macro within a model -2. Using a `config` [resource property](model-properties) in a `.yml` file +2. Using a `config` [resource property](/reference/model-properties) in a `.yml` file 3. 
From the `dbt_project.yml` file, under the `models:` key. Model configurations are applied hierarchically. The most-specific config always "wins": In the project file, configurations applied to a `marketing` subdirectory will take precedence over configurations applied to the entire `jaffle_shop` project. To apply a configuration to a model, or directory of models, define the resource path as nested dictionary keys. @@ -224,7 +225,7 @@ models: ### Apply configurations to one model only -Some types of configurations are specific to a particular model. In these cases, placing configurations in the `dbt_project.yml` file can be unwieldy. Instead, you can specify these configurations at the top of a model `.sql` file, or in its individual yaml properties. +Some types of configurations are specific to a particular model. In these cases, placing configurations in the `dbt_project.yml` file can be unwieldy. Instead, you can specify these configurations at the top of a model `.sql` file, or in its individual YAML properties. diff --git a/website/docs/reference/model-properties.md b/website/docs/reference/model-properties.md index 1e1bb5e7ecf..63adc1f0d63 100644 --- a/website/docs/reference/model-properties.md +++ b/website/docs/reference/model-properties.md @@ -2,11 +2,7 @@ title: Model properties --- - - - **v0.21.0** introduced the `config` property, thereby allowing you to configure models in all `.yml` files - - -Models properties can be declared in `.yml` files in your `models/` directory (as defined by the [`model-paths` config](model-paths)). +Models properties can be declared in `.yml` files in your `models/` directory (as defined by the [`model-paths` config](/reference/project-configs/model-paths)). You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `models/` directory. 
@@ -16,27 +12,63 @@ You can name these files `whatever_you_want.yml`, and nest them arbitrarily deep version: 2 models: - - [name](model_name): - [description](description): + - [name](/reference/resource-properties/model_name): + [description](/reference/resource-properties/description): [docs](/reference/resource-configs/docs): show: true | false - [config](resource-properties/config): - [](model-configs): - [tests](resource-properties/tests): + [latest_version](/reference/resource-properties/latest_version): + [deprecation_date](/reference/resource-properties/deprecation_date): + [access](/reference/resource-configs/access): private | protected | public + [config](/reference/resource-properties/config): + [](/reference/model-configs): + [constraints](/reference/resource-properties/constraints): + - + [tests](/reference/resource-properties/tests): - - ... # declare additional tests - columns: + [columns](/reference/resource-properties/columns): - name: # required - [description](description): - [meta](meta): {} - [quote](quote): true | false - [tests](resource-properties/tests): + [description](/reference/resource-properties/description): + [meta](/reference/resource-configs/meta): {} + [quote](/reference/resource-properties/quote): true | false + [constraints](/reference/resource-properties/constraints): + - + [tests](/reference/resource-properties/tests): - - ... # declare additional tests - [tags](resource-configs/tags): [] + [tags](/reference/resource-configs/tags): [] - name: ... 
# declare properties of additional columns + [versions](/reference/resource-properties/versions): + - [v](/reference/resource-properties/versions#v): # required + [defined_in](/reference/resource-properties/versions#defined-in): + [description](/reference/resource-properties/description): + [docs](/reference/resource-configs/docs): + show: true | false + [access](/reference/resource-configs/access): private | protected | public + [constraints](/reference/resource-properties/constraints): + - + [config](/reference/resource-properties/config): + [](/reference/model-configs): + [tests](/reference/resource-properties/tests): + - + - ... # declare additional tests + columns: + # include/exclude columns from the top-level model properties + - [include](/reference/resource-properties/include-exclude): + [exclude](/reference/resource-properties/include-exclude): + # specify additional columns + - name: # required + [quote](/reference/resource-properties/quote): true | false + [constraints](/reference/resource-properties/constraints): + - + [tests](/reference/resource-properties/tests): + - + - ... # declare additional tests + [tags](/reference/resource-configs/tags): [] + - v: ... # declare additional versions + ``` diff --git a/website/docs/reference/node-selection/defer.md b/website/docs/reference/node-selection/defer.md index f5a6d5e5c0c..03c3b2aac12 100644 --- a/website/docs/reference/node-selection/defer.md +++ b/website/docs/reference/node-selection/defer.md @@ -2,34 +2,31 @@ title: "Defer" --- - +Defer is a powerful feature that makes it possible to run a subset of models or tests in a [sandbox environment](/docs/environments-in-dbt) without having to first build their upstream parents. This can save time and computational resources when you want to test a small number of models in a large project. -- **v0.18.0**: Introduced `--defer` and `--state` flags as beta features. 
-- **v0.19.0**: Changed `--defer` to use the current environment's resource, if it exists, and only fall back to the other environment's resource if the first does not. Also added support for `dbt test --defer`. +Defer requires that a manifest from a previous dbt invocation be passed to the `--state` flag or env var. Together with the `state:` selection method, these features enable "Slim CI". Read more about [state](/reference/node-selection/syntax#about-node-selection). - +An alternative command that accomplishes similar functionality for different use cases is `dbt clone` - see the docs for [clone](/reference/commands/clone#when-to-use-dbt-clone-instead-of-deferral) for more information. -**N.B.** Deferral is a powerful, complex feature that enables compelling workflows. We reserve the right to change the name and syntax in a future version of dbt to make the behavior clearer and more intuitive. For details, see [dbt#2968](https://github.com/dbt-labs/dbt-core/issues/2968). + -Defer is a powerful feature that makes it possible to run a subset of models or tests in a [sandbox environment](docs/collaborate/environments), without having to first build their upstream parents. This can save time and computational resources when you want to test a small number of models in a large project. +It is possible to use separate state for `state:modified` and `--defer`, by passing paths to different manifests to each of the `--state`/`DBT_STATE` and `--defer-state`/`DBT_DEFER_STATE`. This enables more granular control in cases where you want to compare against logical state from one environment or past point in time, and defer to applied state from a different environment or point in time. If `--defer-state` is not specified, deferral will use the manifest supplied to `--state`. In most cases, you will want to use the same state for both: compare logical changes against production, and also "fail over" to the production environment for unbuilt upstream resources. 
-Defer requires that a manifest from a previous dbt invocation be passed to the `--state` flag or env var. Together with the `state:` selection method, these features enable "Slim CI". Read more about [state](understanding-state). -### Usage + - +### Usage ```shell -$ dbt run --select [...] --defer --state path/to/artifacts -$ dbt test --select [...] --defer --state path/to/artifacts +dbt run --select [...] --defer --state path/to/artifacts +dbt test --select [...] --defer --state path/to/artifacts ``` - ```shell -$ dbt run --models [...] --defer --state path/to/artifacts -$ dbt test --models [...] --defer --state path/to/artifacts +dbt run --models [...] --defer --state path/to/artifacts +dbt test --models [...] --defer --state path/to/artifacts ``` @@ -46,13 +43,31 @@ When using defer, you may be selecting from production datasets, development dat - if you apply env-specific limits in dev but not prod, as you may end up selecting more data than you expect - when executing tests that depend on multiple parents (e.g. `relationships`), since you're testing "across" environments -Deferral requires both `--defer` and `--state` to be set, either by passing flags explicitly or by setting environment variables (`DBT_DEFER_TO_STATE` and `DBT_ARTIFACT_STATE_PATH`). If you use dbt Cloud, read about [how to set up CI jobs](/docs/deploy/cloud-ci-job). + + +Deferral requires both `--defer` and `--state` to be set, either by passing flags explicitly or by setting environment variables (`DBT_DEFER_TO_STATE` and `DBT_ARTIFACT_STATE_PATH`). If you use dbt Cloud, read about [how to set up CI jobs](/docs/deploy/continuous-integration). + + + + + +Deferral requires both `--defer` and `--state` to be set, either by passing flags explicitly or by setting environment variables (`DBT_DEFER` and `DBT_STATE`). If you use dbt Cloud, read about [how to set up CI jobs](/docs/deploy/continuous-integration). 
+ + + + + +#### Favor state + +You can optionally skip the second criterion by passing the `--favor-state` flag. If passed, dbt will favor using the node defined in your `--state` namespace, even if the node exists in the current target. + + ### Example In my local development environment, I create all models in my target schema, `dev_alice`. In production, the same models are created in a schema named `prod`. -I access the dbt-generated [artifacts](artifacts) (namely `manifest.json`) from a production run, and copy them into a local directory called `prod-run-artifacts`. +I access the dbt-generated [artifacts](/docs/deploy/artifacts) (namely `manifest.json`) from a production run, and copy them into a local directory called `prod-run-artifacts`. ### run I've been working on `model_b`: @@ -72,9 +87,7 @@ group by 1 I want to test my changes. Nothing exists in my development schema, `dev_alice`. ### test -:::info -Before dbt v0.21, use the `--models` flag instead of `--select`. -::: + ```shell -$ dbt run --select model_b +dbt run --select "model_b" ``` @@ -115,7 +128,7 @@ Unless I had previously run `model_a` into this development environment, `dev_al ```shell -$ dbt run --select model_b --defer --state prod-run-artifacts +dbt run --select "model_b" --defer --state prod-run-artifacts ``` @@ -160,10 +173,6 @@ models: (A bit silly, since all the data in `model_b` had to come from `model_a`, but suspend your disbelief.) -:::info -Before dbt v0.21, use the `--models` flag instead of `--select`. -::: - ```shell -dbt test --select model_b +dbt test --select "model_b" ``` @@ -202,7 +211,7 @@ The `relationships` test requires both `model_a` and `model_b`. 
Because I did no ```shell -dbt test --select model_b --defer --state prod-run-artifacts +dbt test --select "model_b" --defer --state prod-run-artifacts ``` diff --git a/website/docs/reference/node-selection/exclude.md b/website/docs/reference/node-selection/exclude.md index 4fbc1a6e79e..d2c140d1bb5 100644 --- a/website/docs/reference/node-selection/exclude.md +++ b/website/docs/reference/node-selection/exclude.md @@ -1,36 +1,25 @@ --- -title: "Exclude" +title: "Exclude models from your run" +sidebar_label: "Exclude" --- ### Excluding models dbt provides an `--exclude` flag with the same semantics as `--select`. Models specified with the `--exclude` flag will be removed from the set of models selected with `--select`. - - -```bash -$ dbt run --select my_package.*+ --exclude my_package.a_big_model+ -``` - - - - ```bash -$ dbt run --models my_package.*+ --exclude my_package.a_big_model+ +dbt run --select "my_package.*+" --exclude "my_package.a_big_model+" # select all models in my_package and their children except a_big_model and its children ``` - - Exclude a specific resource by its name or lineage: ```bash # test -$ dbt test --exclude not_null_orders_order_id -$ dbt test --exclude orders +dbt test --exclude "not_null_orders_order_id" # test all models except the not_null_orders_order_id test +dbt test --exclude "orders" # test all models except tests associated with the orders model # seed -$ dbt seed --exclude account_parent_mappings +dbt seed --exclude "account_parent_mappings" # load all seeds except account_parent_mappings # snapshot -$ dbt snapshot --exclude snap_order_statuses -$ dbt test --exclude orders+ +dbt snapshot --exclude "snap_order_statuses" # execute all snapshots except snap_order_statuses ``` diff --git a/website/docs/reference/node-selection/graph-operators.md b/website/docs/reference/node-selection/graph-operators.md index 31318b1379a..8cba43e1b52 100644 --- a/website/docs/reference/node-selection/graph-operators.md +++
b/website/docs/reference/node-selection/graph-operators.md @@ -5,49 +5,26 @@ title: "Graph operators" ### The "plus" operator If placed at the front of the model selector, `+` will select all parents of the selected model. If placed at the end of the string, `+` will select all children of the selected model. - ```bash - $ dbt run --select my_model+ # select my_model and all children - $ dbt run --select +my_model # select my_model and all parents - $ dbt run --select +my_model+ # select my_model, and all of its parents and children +dbt run --select "my_model+" # select my_model and all children +dbt run --select "+my_model" # select my_model and all parents +dbt run --select "+my_model+" # select my_model, and all of its parents and children ``` - - - - ```bash - $ dbt run --models my_model+ # select my_model and all children - $ dbt run --models +my_model # select my_model and all parents - $ dbt run --models +my_model+ # select my_model, and all of its parents and children - ``` - - ### The "n-plus" operator -New in v0.18.0 You can adjust the behavior of the `+` operator by quantifying the number of edges to step through. 
- ```bash - $ dbt run --select my_model+1 # select my_model and its first-degree children - $ dbt run --select 2+my_model # select my_model, its first-degree parents, and its second-degree parents ("grandparents") - $ dbt run --select 3+my_model+4 # select my_model, its parents up to the 3rd degree, and its children down to the 4th degree +dbt run --select "my_model+1" # select my_model and its first-degree children +dbt run --select "2+my_model" # select my_model, its first-degree parents, and its second-degree parents ("grandparents") +dbt run --select "3+my_model+4" # select my_model, its parents up to the 3rd degree, and its children down to the 4th degree ``` - - - - ```bash - $ dbt run --models my_model+1 # select my_model and its first-degree children - $ dbt run --models 2+my_model # select my_model, its first-degree parents, and its second-degree parents ("grandparents") - $ dbt run --models 3+my_model+4 # select my_model, its parents up to the 3rd degree, and its children down to the 4th degree - ``` - - ### The "at" operator The `@` operator is similar to `+`, but will also include _the parents of the children of the selected model_. This is useful in continuous integration environments where you want to build a model and all of its children, but the _parents_ of those children might not exist in the database yet. The selector `@snowplow_web_page_context` will build all three models shown in the diagram below. @@ -55,25 +32,5 @@ The `@` operator is similar to `+`, but will also include _the parents of the ch ```bash -$ dbt run --models @my_model # select my_model, its children, and the parents of its children +dbt run --select "@my_model" # select my_model, its children, and the parents of its children ``` - -### The "star" operator -The `*` operator matches all models within a package or directory.
- - - - ```bash - $ dbt run --select snowplow.* # run all of the models in the snowplow package - $ dbt run --select finance.base.* # run all of the models in models/finance/base - ``` - - - - - ```bash - $ dbt run --models snowplow.* # run all of the models in the snowplow package - $ dbt run --models finance.base.* # run all of the models in models/finance/base - ``` - - diff --git a/website/docs/reference/node-selection/methods.md b/website/docs/reference/node-selection/methods.md index 43c0fe1b628..e29612e3401 100644 --- a/website/docs/reference/node-selection/methods.md +++ b/website/docs/reference/node-selection/methods.md @@ -3,75 +3,80 @@ title: "Methods" --- Selector methods return all resources that share a common property, using the -syntax `method:value`. +syntax `method:value`. While it is recommended to explicitly denote the method, +you can omit it (the default value will be one of `path`, `file` or `fqn`). -### The "tag" method -The `tag:` method is used to select models that match a specified [tag](resource-configs/tags). + - +:::info New functionality +New in v1.5! +::: - ```bash - $ dbt run --select tag:nightly # run all models with the `nightly` tag - ``` +Many of the methods below support Unix-style wildcards: + +| Wildcard | Description | +| -------- | --------------------------------------------------------- | +| \* | matches any number of any characters (including none) | +| ? | matches any single character | +| [abc] | matches one character given in the bracket | +| [a-z] | matches one character from the range given in the bracket | + +For example: +``` +dbt list --select "*.folder_name.*" +dbt list --select "package:*_source" +``` - + +### The "tag" method +The `tag:` method is used to select models that match a specified [tag](/reference/resource-configs/tags). 
+ ```bash - $ dbt run --models tag:nightly # run all models with the `nightly` tag - ``` +dbt run --select "tag:nightly" # run all models with the `nightly` tag +``` - ### The "source" method The `source` method is used to select models that select from a specified [source](/docs/build/sources#using-sources). Use in conjunction with the `+` operator. - ```bash - $ dbt run --select source:snowplow+ # run all models that select from Snowplow sources - ``` +dbt run --select "source:snowplow+" # run all models that select from Snowplow sources +``` - - +### The "resource_type" method +Use the `resource_type` method to select nodes of a particular type (`model`, `test`, `exposure`, and so on). This is similar to the `--resource-type` flag used by the [`dbt ls` command](/reference/commands/list). ```bash - $ dbt run --models source:snowplow+ # run all models that select from Snowplow sources - ``` +dbt build --select "resource_type:exposure" # build all resources upstream of exposures +dbt list --select "resource_type:test" # list all tests in your project +``` - +Note: This method doesn't work for sources, so use the [`--resource-type`](/reference/commands/list) option of the list command instead: + + ```bash +dbt list --resource-type source +``` ### The "path" method -The `path` method is used to select models located at or under a specific path. +The `path` method is used to select models/sources defined at or under a specific path. +Model definitions are in SQL/Python files (not YAML), and source definitions are in YAML files. While the `path` prefix is not explicitly required, it may be used to make selectors unambiguous. 
- ```bash # These two selectors are equivalent - dbt run --select path:models/staging/github - dbt run --select models/staging/github + dbt run --select "path:models/staging/github" + dbt run --select "models/staging/github" # These two selectors are equivalent - dbt run --select path:models/staging/github/stg_issues.sql - dbt run --select models/staging/github/stg_issues.sql + dbt run --select "path:models/staging/github/stg_issues.sql" + dbt run --select "models/staging/github/stg_issues.sql" ``` - - - - ```bash - # These two selectors are equivalent - dbt run --models path:models/staging/github - dbt run --models models/staging/github - - # These two selectors are equivalent - dbt run --models path:models/staging/github/stg_issues.sql - dbt run --models models/staging/github/stg_issues.sql - ``` - - @@ -80,70 +85,58 @@ The `file` method can be used to select a model by its filename, including the f ```bash # These are equivalent -dbt run --select some_model.sql -dbt run --select some_model +dbt run --select "file:some_model.sql" +dbt run --select "some_model.sql" +dbt run --select "some_model" ``` +### The "fqn" method + +The `fqn` method is used to select nodes based off their "fully qualified names" (FQN) within the dbt graph. The default output of [`dbt list`](/reference/commands/list) is a listing of FQN. + +```bash +dbt run --select "fqn:some_model" +dbt run --select "fqn:your_project.some_model" +dbt run --select "fqn:some_package.some_other_model" +``` + ### The "package" method -New in v0.18.0 The `package` method is used to select models defined within the root project or an installed dbt package. While the `package:` prefix is not explicitly required, it may be used to make selectors unambiguous. 
- - - ```bash - # These three selectors are equivalent - dbt run --select package:snowplow - dbt run --select snowplow - dbt run --select snowplow.* - ``` - - - ```bash # These three selectors are equivalent - dbt run --models package:snowplow - dbt run --models snowplow - dbt run --models snowplow.* - ``` + dbt run --select "package:snowplow" + dbt run --select "snowplow" + dbt run --select "snowplow.*" +``` - ### The "config" method -New in v0.18.0 -The `config` method is used to select models that match a specified [node config](configs-and-properties). +The `config` method is used to select models that match a specified [node config](/reference/configs-and-properties). - - ```bash - $ dbt run --select config.materialized:incremental # run all models that are materialized incrementally - $ dbt run --select config.schema:audit # run all models that are created in the `audit` schema - $ dbt run --select config.cluster_by:geo_country # run all models clustered by `geo_country` - ``` - - - ```bash - $ dbt run --models config.materialized:incremental # run all models that are materialized incrementally - $ dbt run --models config.schema:audit # run all models that are created in the `audit` schema - $ dbt run --models config.cluster_by:geo_country # run all models clustered by `geo_country` - ``` +dbt run --select "config.materialized:incremental" # run all models that are materialized incrementally +dbt run --select "config.schema:audit" # run all models that are created in the `audit` schema +dbt run --select "config.cluster_by:geo_country" # run all models clustered by `geo_country` +``` - While most config values are strings, you can also use the `config` method to match boolean configs, dictionary keys, and values in lists. For example, given a model with the following configurations: -``` + +```bash {{ config( materialized = 'incremental', unique_key = ['column_a', 'column_b'], @@ -156,99 +149,67 @@ select ... 
You can select using any of the following: ```bash -$ dbt ls -s config.materialized:incremental -$ dbt ls -s config.unique_key:column_a -$ dbt ls -s config.grants.select:reporter -$ dbt ls -s config.transient:true +dbt ls -s config.materialized:incremental +dbt ls -s config.unique_key:column_a +dbt ls -s config.grants.select:reporter +dbt ls -s config.transient:true ``` ### The "test_type" method - - -- New in v0.18.0 -- In v1.0.0, test types were renamed: "singular" (instead of "data") and "generic" (instead of "schema") - - The `test_type` method is used to select tests based on their type, `singular` or `generic`: - - ```bash - $ dbt test --select test_type:generic # run all generic tests - $ dbt test --select test_type:singular # run all singular tests - ``` - - - - ```bash - $ dbt test --models test_type:schema # run all schema tests - $ dbt test --models test_type:data # run all data tests - ``` +```bash +dbt test --select "test_type:generic" # run all generic tests +dbt test --select "test_type:singular" # run all singular tests +``` - ### The "test_name" method -New in v0.18.0 The `test_name` method is used to select tests based on the name of the generic test that defines it. For more information about how generic tests are defined, read about [tests](/docs/build/tests). 
- ```bash - $ dbt test --select test_name:unique # run all instances of the `unique` test - $ dbt test --select test_name:equality # run all instances of the `dbt_utils.equality` test - $ dbt test --select test_name:range_min_max # run all instances of a custom schema test defined in the local project, `range_min_max` - ``` +dbt test --select "test_name:unique" # run all instances of the `unique` test +dbt test --select "test_name:equality" # run all instances of the `dbt_utils.equality` test +dbt test --select "test_name:range_min_max" # run all instances of a custom schema test defined in the local project, `range_min_max` +``` - - - ```bash - $ dbt test --models test_name:unique # run all instances of the `unique` test - $ dbt test --models test_name:equality # run all instances of the `dbt_utils.equality` test - $ dbt test --models test_name:range_min_max # run all instances of a custom schema test defined in the local project, `range_min_max` - ``` +### The "state" method + +**N.B.** State-based selection is a powerful, complex feature. Read about [known caveats and limitations](/reference/node-selection/state-comparison-caveats) to state comparison. + + + +The `state` method is used to select nodes by comparing them against a previous version of the same project, which is represented by a [manifest](/reference/artifacts/manifest-json). The file path of the comparison manifest _must_ be specified via the `--state` flag or `DBT_ARTIFACT_STATE_PATH` environment variable. -### The "state" method - - - **v0.18.0** introduced `state:new` and `state:modified` - - **v0.21.0** introduced `modified` sub-selectors, and handling for upstream macro dependencies - + -**N.B.** State-based selection is a powerful, complex feature. Read about [known caveats and limitations](node-selection/state-comparison-caveats) to state comparison. 
+The `state` method is used to select nodes by comparing them against a previous version of the same project, which is represented by a [manifest](/reference/artifacts/manifest-json). The file path of the comparison manifest _must_ be specified via the `--state` flag or `DBT_STATE` environment variable. -The `state` method is used to select nodes by comparing them against a previous version of the same project, which is represented by a [manifest](artifacts/manifest-json). The file path of the comparison manifest _must_ be specified via the `--state` flag or `DBT_ARTIFACT_STATE_PATH` environment variable. + `state:new`: There is no node with the same `unique_id` in the comparison manifest `state:modified`: All new nodes, plus any changes to existing nodes. - - - ```bash - $ dbt test --select state:new # run all tests on new models + and new tests on old models - $ dbt run --select state:modified # run all models that have been modified - $ dbt ls --select state:modified # list all modified nodes (not just models) - ``` - - - ```bash - $ dbt test --models state:new # run all tests on new models + and new tests on old models - $ dbt run --models state:modified # run all models that have been modified - $ dbt ls --select state:modified # This really is still --select! list all modified nodes (not just models) +dbt test --select "state:new" # run all tests on new models + and new tests on old models +dbt run --select "state:modified" # run all models that have been modified +dbt ls --select "state:modified" # list all modified nodes (not just models) ``` - Because state comparison is complex, and everyone's project is different, dbt supports subselectors that include a subset of the full `modified` criteria: - `state:modified.body`: Changes to node body (e.g.
model SQL, seed values) @@ -256,80 +217,154 @@ Because state comparison is complex, and everyone's project is different, dbt su - `state:modified.relation`: Changes to `database`/`schema`/`alias` (the database representation of this node), irrespective of `target` values or `generate_x_name` macros - `state:modified.persisted_descriptions`: Changes to relation- or column-level `description`, _if and only if_ `persist_docs` is enabled at each level - `state:modified.macros`: Changes to upstream macros (whether called directly or indirectly by another macro) +- `state:modified.contract`: Changes to a model's [contract](/reference/resource-configs/contract), which currently include the `name` and `data_type` of `columns`. Removing or changing the type of an existing column is considered a breaking change, and will raise an error. Remember that `state:modified` includes _all_ of the criteria above, as well as some extra resource-specific criteria, such as modifying a source's `freshness` or `quoting` rules or an exposure's `maturity` property. (View the source code for the full set of checks used when comparing [sources](https://github.com/dbt-labs/dbt-core/blob/9e796671dd55d4781284d36c035d1db19641cd80/core/dbt/contracts/graph/parsed.py#L660-L681), [exposures](https://github.com/dbt-labs/dbt-core/blob/9e796671dd55d4781284d36c035d1db19641cd80/core/dbt/contracts/graph/parsed.py#L768-L783), and [executable nodes](https://github.com/dbt-labs/dbt-core/blob/9e796671dd55d4781284d36c035d1db19641cd80/core/dbt/contracts/graph/parsed.py#L319-L330).) -### The "exposure" method -New in v0.18.1 + -The `exposure` method is used to select parent resources of a specified [exposure](exposures). Use in conjunction with the `+` operator. 
+There are two additional `state` selectors that complement `state:new` and `state:modified` by representing the inverse of those functions: +- `state:old` — A node with the same `unique_id` exists in the comparison manifest +- `state:unmodified` — All existing nodes with no changes - - - ```bash - $ dbt run --select +exposure:weekly_kpis # run all models that feed into the weekly_kpis exposure - $ dbt test --select +exposure:* # test all resources upstream of all exposures - $ dbt ls --select +exposure:* --resource-type source # list all sources upstream of all exposures - ``` +These selectors can help you shorten run times by excluding unchanged nodes. Currently, no subselectors are available, but that might change as use cases evolve. - - ```bash - $ dbt run --models +exposure:weekly_kpis # run all models that feed into the weekly_kpis exposure - $ dbt test --models +exposure:* # test all resources upstream of all exposures - $ dbt ls --select +exposure:* --resource-type source # This really is still --select! list all sources upstream of all exposures - ``` +### The "exposure" method - +The `exposure` method is used to select parent resources of a specified [exposure](/docs/build/exposures). Use in conjunction with the `+` operator. + + + ```bash +dbt run --select "+exposure:weekly_kpis" # run all models that feed into the weekly_kpis exposure +dbt test --select "+exposure:*" # test all resources upstream of all exposures +dbt ls --select "+exposure:*" --resource-type source # list all sources upstream of all exposures +``` ### The "metric" method -New in v1.0.0 -The `metric` method is used to select parent resources of a specified [metric](metrics). Use in conjunction with the `+` operator. +The `metric` method is used to select parent resources of a specified [metric](/docs/build/metrics). Use in conjunction with the `+` operator.
```bash -$ dbt build --select +metric:weekly_active_users # build all resources upstream of weekly_active_users metric -$ dbt ls --select +metric:* --resource-type source # list all source tables upstream of all metrics +dbt build --select "+metric:weekly_active_users" # build all resources upstream of weekly_active_users metric +dbt ls --select "+metric:*" --resource-type source # list all source tables upstream of all metrics ``` ### The "result" method -New in v1.0.0 -The `result` method is related to the `state` method described above, and can be used to select resources based on their result status from a prior run. Note that one of the dbt commands [`run`, `test`, `build`, `seed`] must have been performed in order to create the result on which a result selector operates. You can use `result` selectors in conjunction with the `+` operator. +The `result` method is related to the `state` method described above and can be used to select resources based on their result status from a prior run. Note that one of the dbt commands [`run`, `test`, `build`, `seed`] must have been performed in order to create the result on which a result selector operates. You can use `result` selectors in conjunction with the `+` operator. ```bash -$ dbt run --select result:error # run all models that generated errors on the prior invocation of dbt run -$ dbt test --select result:fail # run all tests that failed on the prior invocation of dbt test -$ dbt build --select 1+result:fail # run all the models associated with failed tests from the prior invocation of dbt build -$ dbt seed --select result:error # run all seeds that generated errors on the prior invocation of dbt seed. 
+dbt run --select "result:error" --state path/to/artifacts # run all models that generated errors on the prior invocation of dbt run +dbt test --select "result:fail" --state path/to/artifacts # run all tests that failed on the prior invocation of dbt test +dbt build --select "1+result:fail" --state path/to/artifacts # run all the models associated with failed tests from the prior invocation of dbt build +dbt seed --select "result:error" --state path/to/artifacts # run all seeds that generated errors on the prior invocation of dbt seed. ``` ### The "source_status" method - + +Supported in v1.1 or higher. + +Another element of job state is the `source_status` of a prior dbt invocation. After executing `dbt source freshness`, for example, dbt creates the `sources.json` artifact which contains execution times and `max_loaded_at` dates for dbt sources. You can read more about `sources.json` on the ['sources'](/reference/artifacts/sources-json) page. + +The following dbt commands produce `sources.json` artifacts whose results can be referenced in subsequent dbt invocations: +- `dbt source freshness` -Only supported by v1.1 or newer. +After issuing one of the above commands, you can reference the source freshness results by adding a selector to a subsequent command as follows: + + + +```bash +# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. +dbt source freshness # must be run again to compare current to previous state +dbt build --select "source_status:fresher+" --state path/to/prod/artifacts +``` - + -Only supported by v1.1 or newer. +```bash +# You can also set the DBT_STATE environment variable instead of the --state flag. +dbt source freshness # must be run again to compare current to previous state +dbt build --select "source_status:fresher+" --state path/to/prod/artifacts +``` -:::caution Experimental functionality -The `source_status` selection method is experimental and subject to change. 
During this time, ongoing improvements may limit this feature’s availability and cause breaking changes to its functionality. -::: + -Another element of job state is the `source_status` of a prior dbt invocation. After executing `dbt source freshness`, for example, dbt creates the `sources.json` artifact which contains execution times and `max_loaded_at` dates for dbt sources. You can read more about `sources.json` on the ['sources'](/docs/reference/artifacts/sources-json) page. -The following dbt commands produce `sources.json` artifacts whose results can be referenced in subsequent dbt invocations: -- `dbt source freshness` +### The "group" method + + +Supported in v1.5 or newer. + + + + + +The `group` method is used to select models defined within a [group](/reference/resource-configs/group). -After issuing one of the above commands, you can reference the source freshness results by adding a selector to a subsequent command as follows: ```bash -# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. -$ dbt source freshness # must be run again to compare current to previous state -$ dbt build --select source_status:fresher+ --state path/to/prod/artifacts +dbt run --select "group:finance" # run all models that belong to the finance group. +``` + + + +### The "access" method + + + +Supported in v1.5 or newer. + + + + + +The `access` method selects models based on their [access](/reference/resource-configs/access) property. + +```bash +dbt list --select "access:public" # list all public models +dbt list --select "access:private" # list all private models +dbt list --select "access:protected" # list all protected models +``` + + + +### The "version" method + + + +Supported in v1.5 or newer. 
+ + + + + +The `version` method selects [versioned models](/docs/collaborate/govern/model-versions) based on their [version identifier](/reference/resource-properties/versions) and [latest version](/reference/resource-properties/latest_version). + +```bash +dbt list --select "version:latest" # only 'latest' versions +dbt list --select "version:prerelease" # versions newer than the 'latest' version +dbt list --select "version:old" # versions older than the 'latest' version + +dbt list --select "version:none" # models that are *not* versioned ``` + + +### The "semantic_model" method + +Supported in v1.6 or newer. + + + +The `semantic_model` method selects [semantic models](/docs/build/semantic-models). + +```bash +dbt list --select "semantic_model:*" # list all semantic models +dbt list --select "+semantic_model:orders" # list your semantic model named "orders" and all upstream resources +``` + + \ No newline at end of file diff --git a/website/docs/reference/node-selection/putting-it-together.md b/website/docs/reference/node-selection/putting-it-together.md index c8be96142d0..48fc5188b32 100644 --- a/website/docs/reference/node-selection/putting-it-together.md +++ b/website/docs/reference/node-selection/putting-it-together.md @@ -2,54 +2,29 @@ title: "Putting it together" --- - ```bash - $ dbt run --select my_package.*+ # select all models in my_package and their children - $ dbt run --select +some_model+ # select some_model and all parents and children +dbt run --select "my_package.*+" # select all models in my_package and their children +dbt run --select "+some_model+" # select some_model and all parents and children - $ dbt run --select tag:nightly+ # select "nightly" models and all children - $ dbt run --select +tag:nightly+ # select "nightly" models and all parents and children +dbt run --select "tag:nightly+" # select "nightly" models and all children +dbt run --select "+tag:nightly+" # select "nightly" models and all parents and children - $ dbt run --select 
@source:snowplow # build all models that select from snowplow sources, plus their parents +dbt run --select "@source:snowplow" # build all models that select from snowplow sources, plus their parents - $ dbt test --select config.incremental_strategy:insert_overwrite,test_name:unique # execute all `unique` tests that select from models using the `insert_overwrite` incremental strategy - ``` +dbt test --select "config.incremental_strategy:insert_overwrite,test_name:unique" # execute all `unique` tests that select from models using the `insert_overwrite` incremental strategy +``` - - - ```bash - $ dbt run --models my_package.*+ # select all models in my_package and their children - $ dbt run --models +some_model+ # select some_model and all parents and children - - $ dbt run --models tag:nightly+ # select "nightly" models and all children - $ dbt run --models +tag:nightly+ # select "nightly" models and all parents and children - - $ dbt run --models @source:snowplow # build all models that select from snowplow sources, plus their parents - - $ dbt test --models config.incremental_strategy:insert_overwrite,test_name:unique # execute all `unique` tests that select from models using the `insert_overwrite` incremental strategy - ``` - - This can get complex! Let's say I want a nightly run of models that build off snowplow data and feed exports, while _excluding_ the biggest incremental models (and one other model, to boot). 
- - - ```bash - $ dbt run --select @source:snowplow,tag:nightly models/export --exclude package:snowplow,config.materialized:incremental export_performance_timing - ``` - - - ```bash - $ dbt run --models @source:snowplow,tag:nightly models/export --exclude package:snowplow,config.materialized:incremental export_performance_timing - ``` +dbt run --select "@source:snowplow,tag:nightly models/export" --exclude "package:snowplow,config.materialized:incremental export_performance_timing" +``` - This command selects all models that: * Select from snowplow sources, plus their parents, _and_ are tagged "nightly" diff --git a/website/docs/reference/node-selection/set-operators.md b/website/docs/reference/node-selection/set-operators.md index 605558c0b92..af399b9cad5 100644 --- a/website/docs/reference/node-selection/set-operators.md +++ b/website/docs/reference/node-selection/set-operators.md @@ -3,80 +3,40 @@ title: "Set operators" --- ### Unions -Providing multiple space-delineated arguments to the `--select`, `--exclude`, or `--selector` flags selects +Providing multiple space-delineated arguments to the `--select` or `--exclude` flags selects the union of them all. If a resource is included in at least one selector, it will be included in the final set. Run snowplow_sessions, all ancestors of snowplow_sessions, fct_orders, and all ancestors of fct_orders: - ```bash - $ dbt run --select +snowplow_sessions +fct_orders +dbt run --select "+snowplow_sessions +fct_orders" ``` - - - - ```bash - $ dbt run --models +snowplow_sessions +fct_orders - ``` - - - ### Intersections -New in v0.18.0 -If multiple arguments to `--select`, `--exclude`, and `--select` can be comma-separated (with no whitespace in between), -dbt will select only resources which satisfy _all_ arguments. +If you separate multiple arguments for `--select` and `--exclude` with commas and no whitespace in between, dbt will select only resources that satisfy _all_ arguments. 
Run all the common ancestors of snowplow_sessions and fct_orders: - ```bash - $ dbt run --select +snowplow_sessions,+fct_orders - ``` - - - - - ```bash - $ dbt run --models +snowplow_sessions,+fct_orders - ``` +dbt run --select "+snowplow_sessions,+fct_orders" +``` - Run all the common descendents of stg_invoices and stg_accounts: - ```bash - $ dbt run --select stg_invoices+,stg_accounts+ +dbt run --select "stg_invoices+,stg_accounts+" ``` - - - - ```bash - $ dbt run --models stg_invoices+,stg_accounts+ - ``` - - Run models that are in the marts/finance subdirectory *and* tagged nightly: - - - ```bash - $ dbt run --select marts.finance,tag:nightly - ``` - - - ```bash - $ dbt run --models marts.finance,tag:nightly - ``` - - +dbt run --select "marts.finance,tag:nightly" +``` diff --git a/website/docs/reference/node-selection/state-comparison-caveats.md b/website/docs/reference/node-selection/state-comparison-caveats.md index 79b66598f41..73947c80a66 100644 --- a/website/docs/reference/node-selection/state-comparison-caveats.md +++ b/website/docs/reference/node-selection/state-comparison-caveats.md @@ -2,21 +2,17 @@ title: "Caveats to state comparison" --- -The [`state:` selection method](methods#the-state-method) is a wildly powerful feature, with a lot of underlying complexity. Below are a handful of considerations when setting up automated jobs that leverage state comparison. +The [`state:` selection method](/reference/node-selection/methods#the-state-method) is a powerful feature, with a lot of underlying complexity. Below are a handful of considerations when setting up automated jobs that leverage state comparison. ### Seeds -dbt stores a file hash of seed files that are <1 MB in size. If the contents of these seeds is modified, the seed will be included in `state:modified`. +dbt stores a file hash of seed files that are <1 MiB in size. If the contents of these seeds is modified, the seed will be included in `state:modified`. 
-If a seed file is >1 MB in size, dbt cannot compare its contents and will raise a warning as such. Instead, dbt will use only the seed's file path to detect changes. If the file path has changed, the seed will be included in `state:modified`; if it hasn't, it won't. +If a seed file is >1 MiB in size, dbt cannot compare its contents and will raise a warning as such. Instead, dbt will use only the seed's file path to detect changes. If the file path has changed, the seed will be included in `state:modified`; if it hasn't, it won't. ### Macros - - -- New in v0.21.0: dbt will mark modified any resource that depends on a changed macro, or on a macro that depends on a changed macro. - - +dbt will mark modified any resource that depends on a changed macro, or on a macro that depends on a changed macro. ### Vars @@ -31,8 +27,8 @@ The command `dbt test -s state:modified` will include both: As long as you're adding or changing tests at the same time that you're adding or changing the resources (models, seeds, snapshots) they select from, all should work the way you expect with "simple" state selection: ```shell -$ dbt run -s state:modified -$ dbt test -s state:modified +dbt run -s "state:modified" +dbt test -s "state:modified" ``` This can get complicated, however. If you add a new test without modifying its underlying model, or add a test that selects from a new model and an old unmodified one, you may need to test a model without having first run it. @@ -40,8 +36,8 @@ This can get complicated, however. If you add a new test without modifying its u In v0.18.0, you needed to handle this by building the unmodified models needed for modified tests: ```shell -$ dbt run -s state:modified @state:modified,1+test_type:data -$ dbt test -s state:modified +dbt run -s "state:modified @state:modified,1+test_type:data" +dbt test -s "state:modified" ``` In v0.19.0, dbt added support for deferring upstream references when testing. 
If a test selects from a model that doesn't exist as a database object in your current environment, dbt will look to the other environment instead—the one defined in your state manifest. This enables you to use "simple" state selection without risk of query failure, but it may have some surprising consequences for tests with multiple parents. For instance, if you have a `relationships` test that depends on one modified model and one unmodified model, the test query will select from data "across" two different environments. If you limit or sample your data in development and CI, it may not make much sense to test for referential integrity, knowing there's a good chance of mismatch. @@ -49,8 +45,8 @@ In v0.19.0, dbt added support for deferring upstream references when testing. If If you're a frequent user of `relationships` tests or data tests, or frequently find yourself adding tests without modifying their underlying models, consider tweaking the selection criteria of your CI job. For instance: ```shell -$ dbt run -s state:modified -$ dbt test -s state:modified --exclude test_name:relationships +dbt run -s "state:modified" +dbt test -s "state:modified" --exclude "test_name:relationships" ``` ### False positives @@ -60,16 +56,9 @@ State comparison works by identifying discrepancies between two manifests. Thos 1. Changes made to a project in development 2. Env-aware logic that causes different behavior based on the `target`, env vars, etc. -dbt will do its best to capture *only* changes that are the result of modifications made in development. In projects with intricate env-aware logic, dbt will err on the side of running too many models (i.e. false positives). 
Over the next several versions of dbt, We're working on: -- iterative improvements to dbt's built-in dectective abilities -- better options for more complex projects, in the form of more-specific subselectors (see [this issue](https://github.com/dbt-labs/dbt-core/issues/2704)) - - - -- v0.18.0: All env-aware logic results in false positives during state comparison, when comparing against a manifest generated with a different target. -- v0.19.0: dbt stores and compares unrendered Jinja expressions for configurations, allowing it to see past env-aware logic in `dbt_project.yml`. - - +dbt will do its best to capture *only* changes that are the result of modifications made in development. In projects with intricate env-aware logic, dbt will err on the side of running too many models (i.e. false positives). Over the next several versions of dbt, we're working on: +- iterative improvements to dbt's built-in detective abilities +- better options for more complex projects, in the form of more-specific sub-selectors (see [this issue](https://github.com/dbt-labs/dbt-core/issues/2704)) State comparison is now able to detect env-aware config in `dbt_project.yml`. 
For instance, this target-based config would register as a modification in v0.18.0, but in v0.19.0 it no longer will: diff --git a/website/docs/reference/node-selection/syntax.md b/website/docs/reference/node-selection/syntax.md index 5787ca3355b..616bcfe6447 100644 --- a/website/docs/reference/node-selection/syntax.md +++ b/website/docs/reference/node-selection/syntax.md @@ -6,33 +6,32 @@ dbt's node selection syntax makes it possible to run only specific resources in | command | argument(s) | | :------------------------------ | -------------------------------------------------------------------- | -| [run](commands/run) | `--select`, `--exclude`, `--selector`, `--defer` | -| [test](commands/test) | `--select`, `--exclude`, `--selector`, `--defer` | -| [seed](commands/seed) | `--select`, `--exclude`, `--selector` | -| [snapshot](commands/snapshot) | `--select`, `--exclude` `--selector` | -| [ls (list)](commands/list) | `--select`, `--exclude`, `--selector`, `--resource-type` | -| [compile](commands/compile) | `--select`, `--exclude`, `--selector` | -| [freshness](commands/source) | `--select`, `--exclude`, `--selector` | -| [build](commands/build) | `--select`, `--exclude`, `--selector`, `--resource-type`, `--defer` | +| [run](/reference/commands/run) | `--select`, `--exclude`, `--selector`, `--defer` | +| [test](/reference/commands/test) | `--select`, `--exclude`, `--selector`, `--defer` | +| [seed](/reference/commands/seed) | `--select`, `--exclude`, `--selector` | +| [snapshot](/reference/commands/snapshot) | `--select`, `--exclude` `--selector` | +| [ls (list)](/reference/commands/list) | `--select`, `--exclude`, `--selector`, `--resource-type` | +| [compile](/reference/commands/compile) | `--select`, `--exclude`, `--selector`, `--inline` | +| [freshness](/reference/commands/source) | `--select`, `--exclude`, `--selector` | +| [build](/reference/commands/build) | `--select`, `--exclude`, `--selector`, `--resource-type`, `--defer` | +| [docs 
generate](/reference/commands/cmd-docs) | `--select`, `--exclude`, `--selector`, `--defer` | :::info Nodes and resources -We use the terms "nodes" and "resources" interchangeably. These encompass all the models, tests, sources, seeds, snapshots, exposures, and analyses in your project. They are the objects that make up dbt's DAG (directed acyclic graph). +We use the terms "nodes" and "resources" interchangeably. These encompass all the models, tests, sources, seeds, snapshots, exposures, and analyses in your project. They are the objects that make up dbt's DAG (directed acyclic graph). ::: ## Specifying resources By default, `dbt run` executes _all_ of the models in the dependency graph; `dbt seed` creates all seeds, `dbt snapshot` performs every snapshot. The `--select` flag is used to specify a subset of nodes to execute. -:::info -Before dbt v0.21, certain commands (notably `run`, `test`, and `compile`) used a flag called `--models` instead of `--select`. The two were functionally identical. Those commands still support the `--models` flag for backwards compatibility. -::: +To follow [POSIX standards](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap12.html) and make things easier to understand, we recommend CLI users use quotes when passing arguments to the `--select` or `--exclude` option (including single or multiple space-delimited, or comma-delimited arguments). Not using quotes might not work reliably on all operating systems, terminals, and user interfaces. For example, `dbt run --select "my_dbt_project_name"` runs all models in your project. ### How does selection work? -1. dbt gathers all the resources that are matched by one or more of the `--select` criteria, in the order of selection methods (e.g. `tag:`), then graph operators (e.g. `+`), then finally set operators (unions, intersections, exclusions). +1. dbt gathers all the resources that are matched by one or more of the `--select` criteria, in the order of selection methods (e.g. 
`tag:`), then graph operators (e.g. `+`), then finally set operators ([unions](/reference/node-selection/set-operators#unions), [intersections](/reference/node-selection/set-operators#intersections), [exclusions](/reference/node-selection/exclude)). -2. The selected resources may be models, sources, seeds, snapshots, tests. (Tests can also be selected "indirectly" via their parents; see [test selection examples](test-selection-examples) for details.) +2. The selected resources may be models, sources, seeds, snapshots, tests. (Tests can also be selected "indirectly" via their parents; see [test selection examples](/reference/node-selection/test-selection-examples) for details.) 3. dbt now has a list of still-selected resources of varying types. As a final step, it tosses away any resource that does not match the resource type of the current task. (Only seeds are kept for `dbt seed`, only models for `dbt run`, only tests for `dbt test`, and so on.) @@ -53,67 +52,159 @@ The `--select` flag accepts one or more arguments. 
Each argument can be one of: Examples: - ```bash - $ dbt run --select my_dbt_project_name # runs all models in your project - $ dbt run --select my_dbt_model # runs a specific model - $ dbt run --select path.to.my.models # runs all models in a specific directory - $ dbt run --select my_package.some_model # run a specific model in a specific package - $ dbt run --select tag:nightly # run models with the "nightly" tag - $ dbt run --select path/to/models # run models contained in path/to/models - $ dbt run --select path/to/my_model.sql # run a specific model by its path +dbt run --select "my_dbt_project_name" # runs all models in your project +dbt run --select "my_dbt_model" # runs a specific model +dbt run --select "path.to.my.models" # runs all models in a specific directory +dbt run --select "my_package.some_model" # run a specific model in a specific package +dbt run --select "tag:nightly" # run models with the "nightly" tag +dbt run --select "path/to/models" # run models contained in path/to/models +dbt run --select "path/to/my_model.sql" # run a specific model by its path ``` - - +dbt supports a shorthand language for defining subsets of nodes. This language uses the characters `+`, `@`, `*`, and `,`. 
+ ```bash - $ dbt run --models my_dbt_project_name # runs all models in your project - $ dbt run --models my_dbt_model # runs a specific model - $ dbt run --models path.to.my.models # runs all models in a specific directory - $ dbt run --models my_package.some_model # run a specific model in a specific package - $ dbt run --models tag:nightly # run models with the "nightly" tag - $ dbt run --models path/to/models # run models contained in path/to/models - $ dbt run --models path/to/my_model.sql # run a specific model by its path - ``` +# multiple arguments can be provided to --select + dbt run --select "my_first_model my_second_model" + +# these arguments can be projects, models, directory paths, tags, or sources +dbt run --select "tag:nightly my_model finance.base.*" + +# use methods and intersections for more complex selectors +dbt run --select "path:marts/finance,tag:nightly,config.materialized:table" +``` + +As your selection logic gets more complex, and becomes unwieldy to type out as command-line arguments, +consider using a [yaml selector](/reference/node-selection/yaml-selectors). You can use a predefined definition with the `--selector` flag. +Note that when you're using `--selector`, most other flags (namely `--select` and `--exclude`) will be ignored. + + + + +## Stateful selection + +One of the greatest underlying assumptions about dbt is that its operations should be **stateless** and **idempotent**. That is, it doesn't matter how many times a model has been run before, or if it has ever been run before. It doesn't matter if you run it once or a thousand times. Given the same raw data, you can expect the same transformed result. A given run of dbt doesn't need to "know" about _any other_ run; it just needs to know about the code in the project and the objects in your database as they exist _right now_. 
+ +That said, dbt does store "state"—a detailed, point-in-time view of project resources (also referred to as nodes), database objects, and invocation results—in the form of its [artifacts](/docs/deploy/artifacts). If you choose, dbt can use these artifacts to inform certain operations. Crucially, the operations themselves are still stateless and idempotent: given the same manifest and the same raw data, dbt will produce the same transformed result. + +dbt can leverage artifacts from a prior invocation as long as their file path is passed to the `--state` flag. This is a prerequisite for: +- [The `state:` selector](/reference/node-selection/methods#the-state-method), whereby dbt can identify resources that are new or modified +by comparing code in the current project against the state manifest. +- [Deferring](/reference/node-selection/defer) to another environment, whereby dbt can identify upstream, unselected resources that don't exist in your current environment and instead "defer" their references to the environment provided by the state manifest. +- The [`dbt clone` command](/reference/commands/clone), whereby dbt can clone nodes based on their location in the manifest provided to the `--state` flag. + +Together, the `state:` selector and deferral enable ["slim CI"](/guides/legacy/best-practices#run-only-modified-models-to-test-changes-slim-ci). We expect to add more features in future releases that can leverage artifacts passed to the `--state` flag. + +### Establishing state + +State and defer can be set by environment variables as well as CLI flags: + + + +- `--state` or `DBT_ARTIFACT_STATE_PATH`: file path +- `--defer` or `DBT_DEFER_TO_STATE`: boolean -dbt supports a shorthand language for defining subsets of nodes. This language uses the characters `+`, `@`, `*`, and `,`. 
+ - +- `--state` or `DBT_STATE`: file path +- `--defer` or `DBT_DEFER`: boolean - ```bash - # multiple arguments can be provided to --select - $ dbt run --select my_first_model my_second_model +:::warning Syntax deprecated - # these arguments can be projects, models, directory paths, tags, or sources - $ dbt run --select tag:nightly my_model finance.base.* +In dbt v1.5, we deprecated the original syntax for state (`DBT_ARTIFACT_STATE_PATH`) and defer (`DBT_DEFER_TO_STATE`). Although dbt supports backward compatibility with the old syntax, we will remove it in a future release that we have not yet determined. - # use methods and intersections for more complex selectors - $ dbt run --select path:marts/finance,tag:nightly,config.materialized:table - ``` +::: - - ```bash - # multiple arguments can be provided to --select - $ dbt run --models my_first_model my_second_model + - # these arguments can be projects, models, directory paths, tags, or sources - $ dbt run --models tag:nightly my_model finance.base.* +- `--state` or `DBT_STATE`: file path +- `--defer` or `DBT_DEFER`: boolean +- `--defer-state` or `DBT_DEFER_STATE`: file path to use for deferral only (optional) - # use methods and intersections for more complex selectors - $ dbt run --models path:marts/finance,tag:nightly,config.materialized:table - ``` +If `--defer-state` is not specified, deferral will use the artifacts supplied by `--state`. This enables more granular control in cases where you want to compare against logical state from one environment or past point in time, and defer to applied state from a different environment or point in time. -As your selection logic gets more complex, and becomes unwieldly to type out as command-line arguments, -consider using a [yaml selector](yaml-selectors). You can use a predefined definition with the `--selector` flag. -Note that when you're using `--selector`, most other flags (namely `--select` and `--exclude`) will be ignored. 
+If both the flag and env var are provided, the flag takes precedence. - - +#### Notes: +- The `--state` artifacts must be of schema versions that are compatible with the currently running dbt version. +- The path to state artifacts can be set via the `--state` flag or `DBT_ARTIFACT_STATE_PATH` environment variable. If both the flag and env var are provided, the flag takes precedence. +- These are powerful, complex features. Read about [known caveats and limitations](/reference/node-selection/state-comparison-caveats) to state comparison. + +### The "result" status + +Another element of job state is the `result` of a prior dbt invocation. After executing a `dbt run`, for example, dbt creates the `run_results.json` artifact which contains execution times and success / error status for dbt models. You can read more about `run_results.json` on the ['run results'](/reference/artifacts/run-results-json) page. + +The following dbt commands produce `run_results.json` artifacts whose results can be referenced in subsequent dbt invocations: +- `dbt run` +- `dbt test` +- `dbt build` (new in dbt version v0.21.0) +- `dbt seed` + +After issuing one of the above commands, you can reference the results by adding a selector to a subsequent command as follows: + +```bash +# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. +dbt run --select "result:<status>" --defer --state path/to/prod/artifacts +``` + +The available options depend on the resource (node) type: + +| | model | seed | snapshot | test | +|----------------|-------|------|------|----------| +| `result:error` | ✅ | ✅ | ✅ | ✅ | +| `result:success` | ✅ | ✅ | ✅ | | +| `result:skipped` | ✅ | | ✅ | ✅ | +| `result:fail` | | | | ✅ | +| `result:warn` | | | | ✅ | +| `result:pass` | | | | ✅ | + +### Combining `state` and `result` selectors + +The state and result selectors can also be combined in a single invocation of dbt to capture errors from a previous run OR any new or modified models. 
+ +```bash +dbt run --select "result:<status>+ state:modified+" --defer --state ./<dbt-artifact-path> +``` + +### Fresh rebuilds + +Only supported by v1.1 or newer. + +When a job is selected, dbt Cloud will surface the artifacts from that job's most recent successful run. dbt will then use those artifacts to determine the set of fresh sources. In your job commands, you can signal to dbt to run and test only on these fresher sources and their children by including the `source_status:fresher+` argument. This requires both previous and current state to have the `sources.json` artifact available. Or plainly said, both job states need to run `dbt source freshness`. + +As an example: + +```bash +# Command step order +dbt source freshness +dbt build --select "source_status:fresher+" +``` + + +For more example commands, refer to [Pro-tips for workflows](/guides/legacy/best-practices.md#pro-tips-for-workflows). + +### The "source_status" status + +Only supported by v1.1 or newer. + +Another element of job state is the `source_status` of a prior dbt invocation. After executing `dbt source freshness`, for example, dbt creates the `sources.json` artifact which contains execution times and `max_loaded_at` dates for dbt sources. You can read more about `sources.json` on the ['sources'](/reference/artifacts/sources-json) page. + +The following dbt commands produce `sources.json` artifacts whose results can be referenced in subsequent dbt invocations: +- `dbt source freshness` + +After issuing one of the above commands, you can reference the source freshness results by adding a selector to a subsequent command as follows: + +```bash +# You can also set the DBT_ARTIFACT_STATE_PATH environment variable instead of the --state flag. 
+dbt source freshness # must be run again to compare current to previous state +dbt build --select "source_status:fresher+" --state path/to/prod/artifacts +``` diff --git a/website/docs/reference/node-selection/test-selection-examples.md b/website/docs/reference/node-selection/test-selection-examples.md index fcb4ecd8ccc..feb3898c230 100644 --- a/website/docs/reference/node-selection/test-selection-examples.md +++ b/website/docs/reference/node-selection/test-selection-examples.md @@ -11,253 +11,310 @@ Like all resource types, tests can be selected **directly**, by methods and oper Unlike other resource types, tests can also be selected **indirectly**. If a selection method or operator includes a test's parent(s), the test will also be selected. [See below](#indirect-selection) for more details. - - -* `v0.20.0`: Test selection is no longer greedy for indirect inclusion (ALL parents must be selected for the test to be selected). It is still greedy for indirect exclusion (if ANY parent is excluded, the test is excluded). -* `v0.21.0`: Introduce `--greedy` flag (and `greedy` selector property), to optionally include tests that are indirectly selected and have an unselected parent. -* `v1.0.0`: Renamed the `--greedy` flag/property to `indirect_selection`, and set its default back to "eager" (pre-v0.20). You can achieve the "cautious" behavior introduced in v0.20 by setting the flag/property to `cautious`. - - - Test selection is powerful, and we know it can be tricky. To that end, we've included lots of examples below: ### Direct selection - - -* `v0.18.0`: Introduced the `test_type` selection method. In previous versions, similar behavior is possible via the `--schema` or `--data` flags. -- `v1.0.0`: Renamed test types: "generic" (formerly "schema") and "singular" (formerly "data"). Removed support for the `--schema` and `--data` flags. 
- - - Run generic tests only: - ```bash - $ dbt test --select test_type:generic + dbt test --select "test_type:generic" ``` - - +Run singular tests only: + ```bash - $ dbt test --models test_type:data + dbt test --select "test_type:singular" ``` - +In both cases, `test_type` checks a property of the test itself. These are forms of "direct" test selection. -Run singular tests only: +### Indirect selection - + - ```bash - $ dbt test --select test_type:singular - ``` +There are two modes to configure the behavior when performing indirect selection (with `eager` as the default): - - +1. `eager` (default) - include ANY test that references the selected nodes +1. `cautious` - restrict to tests that ONLY refer to selected nodes - ```bash - $ dbt test --models test_type:data - ``` +Note that test exclusion is always greedy: if ANY parent is explicitly excluded, the test will be excluded as well. + +The "cautious" mode can be useful in environments when you're only building a subset of your DAG, and you want to avoid test failures in "eager" mode caused by unbuilt resources. (Another way to achieve this is with [deferral](/reference/node-selection/defer)). -In both cases, `test_type` checks a property of the test itself. These are forms of "direct" test selection. + -### Indirect selection +There are three modes to configure the behavior when performing indirect selection (with `eager` as the default): - +1. `eager` (default) - include ANY test that references the selected nodes +1. `cautious` - restrict to tests that ONLY refer to selected nodes +1. `buildable` - restrict to tests that ONLY refer to selected nodes (or their ancestors) - ```bash - $ dbt test --select customers - $ dbt test --select orders - ``` +Note that test exclusion is always greedy: if ANY parent is explicitly excluded, the test will be excluded as well. 
+ +The "buildable" and "cautious" modes can be useful in environments when you're only building a subset of your DAG, and you want to avoid test failures in "eager" mode caused by unbuilt resources. (Another way to achieve this is with [deferral](/reference/node-selection/defer)). - - ```bash - $ dbt test --models customers - $ dbt test --models orders - ``` + + +These are the modes to configure the behavior when performing indirect selection (with `eager` as the default): + +1. `eager` (default) - include ANY test that references the selected nodes +1. `cautious` - restrict to tests that ONLY refer to selected nodes +1. `buildable` - restrict to tests that ONLY refer to selected nodes (or their ancestors) +1. `empty` - restrict to tests that are only for the selected node and ignore all tests from the attached nodes + +Note that test exclusion is always greedy: if ANY parent is explicitly excluded, the test will be excluded as well. + +The "buildable", "cautious", and "empty" modes can be useful in environments when you're only building a subset of your DAG, and you want to avoid test failures in "eager" mode caused by unbuilt resources. (Another way to achieve this is with [deferral](/reference/node-selection/defer)). -These are examples of "indirect" selection: `customers` and `orders` select models (whether by name or path). Any tests defined on either `customers` or `orders` will be selected indirectly, and thereby included. + - By default, a test will run when ANY parent is selected; we call this "eager" indirect selection. In this example, that would include _any_ test that touches either `customers` or `orders`, even if it touches other models as well. + -It is possible to prevent tests from running if one or more of its parents is unselected, however; we call this "cautious" indirect selection. This can be useful in environments when you're only building a subset of your DAG, and you want to avoid test failures by tests that depend on unbuilt resources. 
(Another way to achieve this is with [deferral](defer)). + + -With `dbt test --indirect-selection=cautious` (or setting `indirect_selection: cautious` in a [yaml selector](yaml-selectors)) tests will be indirectly selected only if **ALL** of its parents are included by the selection criteria. If any parent is missing, that test won't run. Note that test _exclusion_ is always greedy: if **ANY** parent is explicitly excluded, the test will be excluded as well. +By default, a test will run when ANY parent is selected; we call this "eager" indirect selection. In this example, that would include any test that references orders, even if it references other models as well. -Imagine a `relationships` test between `customers` and `orders`. By default, the selection criteria above would select that test "eagerly." If you opt for "cautious" indirect selection instead, the `relationships` test would _not_ be selected by the criteria above, because one of its parents is unselected. It would be selected indirectly only ("cautiously") if both parents are selected: +In this mode, any test that depends on unbuilt resources will raise an error. ```shell -$ dbt test --select customers orders --indirect-selection=cautious +dbt test --select "orders" +dbt build --select "orders" ``` - + - ```bash - $ dbt test --select customers orders - ``` + + +It is possible to prevent tests from running if one or more of its parents is unselected (and therefore unbuilt); we call this "cautious" indirect selection. + +It will only include tests whose references are each within the selected nodes. + +Put another way, it will prevent tests from running if one or more of its parents is unselected. + +```shell + +dbt test --select "orders" --indirect-selection=cautious +dbt build --select "orders" --indirect-selection=cautious + +``` + + + + - - ```bash - $ dbt test --models customers orders - ``` + + + + + +By default, a test will run when ANY parent is selected; we call this "eager" indirect selection. 
In this example, that would include any test that references orders, even if it references other models as well. + +In this mode, any test that depends on unbuilt resources will raise an error. + +```shell +dbt test --select "orders" +dbt build --select "orders" +``` + + + + + +It is possible to prevent tests from running if one or more of its parents is unselected (and therefore unbuilt); we call this "cautious" indirect selection. + +It will only include tests whose references are each within the selected nodes. + +Put another way, it will prevent tests from running if one or more of its parents is unselected. + +```shell + +dbt test --select "orders" --indirect-selection=cautious +dbt build --select "orders" --indirect-selection=cautious + +``` + + + + + +Like "cautious", this mode is conservative, but it is slightly more inclusive. + +It will only include tests whose references are each within the selected nodes (or their ancestors). + +This is useful in the same scenarios as "cautious", but also includes when a test depends on a model **and** a direct ancestor of that model (like confirming an aggregation has the same totals as its input). + +```shell +dbt test --select "orders" --indirect-selection=buildable +dbt build --select "orders" --indirect-selection=buildable + +``` + + + + - ### Syntax examples + -The following examples should feel somewhat familiar if you're used to executing `dbt run` with the `--select` option to build parts of your DAG: + + - +By default, a test will run when ANY parent is selected; we call this "eager" indirect selection. In this example, that would include any test that references orders, even if it references other models as well. - ```bash - # Run tests on a model (indirect selection) - $ dbt test --select customers +In this mode, any test that depends on unbuilt resources will raise an error. 
- # Run tests on all models in the models/staging/jaffle_shop directory (indirect selection) - $ dbt test --select staging.jaffle_shop +```shell +dbt test --select "orders" +dbt build --select "orders" +``` - # Run tests downstream of a model (note this will select those tests directly!) - $ dbt test --select stg_customers+ + - # Run tests upstream of a model (indirect selection) - $ dbt test --select +stg_customers + - # Run tests on all models with a particular tag (direct + indirect) - $ dbt test --select tag:my_model_tag +It is possible to prevent tests from running if one or more of its parents is unselected (and therefore unbuilt); we call this "cautious" indirect selection. - # Run tests on all models with a particular materialization (indirect selection) - $ dbt test --select config.materialized:table +It will only include tests whose references are each within the selected nodes. - ``` +Put another way, it will prevent tests from running if one or more of its parents is unselected. + +```shell +dbt test --select "orders" --indirect-selection=cautious +dbt build --select "orders" --indirect-selection=cautious + +``` + + + + + +Like "cautious", this mode is conservative, but it is slightly more inclusive. + +It will only include tests whose references are each within the selected nodes (or their ancestors). + +This is useful in the same scenarios as "cautious", but also includes when a test depends on a model **and** a direct ancestor of that model (like confirming an aggregation has the same totals as its input). + +```shell +dbt test --select "orders" --indirect-selection=buildable +dbt build --select "orders" --indirect-selection=buildable +``` + + + + + +This mode will only include tests whose references are each within the selected nodes and will ignore all tests from attached nodes. 
+ +```shell + +dbt test --select "orders" --indirect-selection=empty +dbt build --select "orders" --indirect-selection=empty + +``` + + + + - + + + +### Syntax examples + +Setting `indirect_selection` can also be specified in a [yaml selector](/reference/node-selection/yaml-selectors#indirect-selection). + +The following examples should feel somewhat familiar if you're used to executing `dbt run` with the `--select` option to build parts of your DAG: + ```bash # Run tests on a model (indirect selection) - $ dbt test --models customers + dbt test --select "customers" + + # Run tests on two or more specific models (indirect selection) + dbt test --select "customers orders" # Run tests on all models in the models/staging/jaffle_shop directory (indirect selection) - $ dbt test --models staging.jaffle_shop + dbt test --select "staging.jaffle_shop" # Run tests downstream of a model (note this will select those tests directly!) - $ dbt test --models stg_customers+ + dbt test --select "stg_customers+" # Run tests upstream of a model (indirect selection) - $ dbt test --models +stg_customers + dbt test --select "+stg_customers" # Run tests on all models with a particular tag (direct + indirect) - $ dbt test --models tag:my_model_tag + dbt test --select "tag:my_model_tag" # Run tests on all models with a particular materialization (indirect selection) - $ dbt test --models config.materialized:table + dbt test --select "config.materialized:table" ``` - - The same principle can be extended to tests defined on other resource types. 
In these cases, we will execute all tests defined on certain sources via the `source:` selection method: - ```bash # tests on all sources - $ dbt test --select source:* - # tests on one source - $ dbt test --select source:jaffle_shop - - # tests on one source table - $ dbt test --select source:jaffle_shop.customers - - # tests on everything _except_ sources - $ dbt test --exclude source:* - ``` - - - - - ```bash - # tests on all sources - $ dbt test --models source:* + dbt test --select "source:*" # tests on one source - $ dbt test --models source:jaffle_shop + dbt test --select "source:jaffle_shop" + + # tests on two or more specific sources + dbt test --select "source:jaffle_shop source:raffle_bakery" # tests on one source table - $ dbt test --models source:jaffle_shop.customers + dbt test --select "source:jaffle_shop.customers" # tests on everything _except_ sources - $ dbt test --exclude source:* + dbt test --exclude "source:*" ``` - - ### More complex selection Through the combination of direct and indirect selection, there are many ways to accomplish the same outcome. Let's say we have a data test named `assert_total_payment_amount_is_positive` that depends on a model named `payments`. 
All of the following would manage to select and execute that test specifically: - ```bash - $ dbt test --select assert_total_payment_amount_is_positive # directly select the test by name - $ dbt test --select payments,test_type:data # indirect selection, v0.18.0 - $ dbt test --select payments --data # indirect selection, earlier versions - ``` - - + dbt test --select "assert_total_payment_amount_is_positive" # directly select the test by name + dbt test --select "payments,test_type:singular" # indirect selection, v1.2 + dbt test --select "payments,test_type:data" # indirect selection, v0.18.0 + dbt test --select "payments" --data # indirect selection, earlier versions - ```bash - $ dbt test --models assert_total_payment_amount_is_positive # directly select the test by name - $ dbt test --models payments,test_type:data # indirect selection, v0.18.0 - $ dbt test --models payments --data # indirect selection, earlier versions ``` - As long as you can select a common property of a group of resources, indirect selection allows you to execute all the tests on those resources, too. In the example above, we saw it was possible to test all table-materialized models. 
This principle can be extended to other resource types, too: - ```bash # Run tests on all models with a particular materialization - $ dbt test --select config.materialized:table + dbt test --select "config.materialized:table" # Run tests on all seeds, which use the 'seed' materialization - $ dbt test --select config.materialized:seed + dbt test --select "config.materialized:seed" # Run tests on all snapshots, which use the 'snapshot' materialization - $ dbt test --select config.materialized:snapshot - ``` - - - - - ```bash - # Run tests on all models with a particular materialization - $ dbt test --models config.materialized:table + dbt test --select "config.materialized:snapshot" - # Run tests on all seeds, which use the 'seed' materialization - $ dbt test --models config.materialized:seed - - # Run tests on all snapshots, which use the 'snapshot' materialization - $ dbt test --models config.materialized:snapshot ``` - - Note that this functionality may change in future versions of dbt. ### Run tests on tagged columns @@ -273,29 +330,20 @@ models: - name: orders columns: - name: order_id - tests: tags: [my_column_tag] + tests: - unique ``` - ```bash - $ dbt test --select tag:my_column_tag - ``` - - - + dbt test --select "tag:my_column_tag" - ```bash - $ dbt test --models tag:my_column_tag ``` - - Currently, tests "inherit" tags applied to columns, sources, and source tables. They do _not_ inherit tags applied to models, seeds, or snapshots. In all likelihood, those tests would still be selected indirectly, because the tag selects its parent. This is a subtle distinction, and it may change in future versions of dbt. 
### Run tagged tests only @@ -319,17 +367,8 @@ models: - ```bash - $ dbt test --select tag:my_test_tag - ``` + dbt test --select "tag:my_test_tag" - - - - ```bash - $ dbt test --models tag:my_test_tag ``` - - diff --git a/website/docs/reference/node-selection/yaml-selectors.md b/website/docs/reference/node-selection/yaml-selectors.md index eeaa7be7267..1e3f8d8d1e2 100644 --- a/website/docs/reference/node-selection/yaml-selectors.md +++ b/website/docs/reference/node-selection/yaml-selectors.md @@ -2,14 +2,6 @@ title: "YAML Selectors" --- - - -- **v0.18.0**: Introduced YAML selectors -- **v0.19.0**: Added optional `description` property. Selectors appear in `manifest.json` under a new `selectors` key. -- **v0.21.0**: Added optional `default` + `greedy` properties - - - Write resource selectors in YAML, save them with a human-friendly name, and reference them using the `--selector` flag. By recording selectors in a top-level `selectors.yml` file: @@ -39,35 +31,41 @@ Each `definition` is comprised of one or more arguments, which can be one of the * **Key-value:** pairs in the form `method: value` * **Full YAML:** fully specified dictionaries with items for `method`, `value`, operator-equivalent keywords, and support for `exclude` -Use `union` and `intersection` to organize multiple arguments. +Use the `union` and `intersection` operator-equivalent keywords to organize multiple arguments. ### CLI-style + ```yml definition: 'tag:nightly' ``` -This simple syntax supports use of the `+`, `@`, and `*` operators. It does -not support `exclude`. +This simple syntax supports use of the `+`, `@`, and `*` [graph](/reference/node-selection/graph-operators) operators, but it does not support [set](/reference/node-selection/set-operators) operators or `exclude`. ### Key-value + ```yml definition: tag: nightly ``` -This simple syntax does not support any operators or `exclude`. 
+This simple syntax does not support any [graph](/reference/node-selection/graph-operators) or [set](/reference/node-selection/set-operators) operators or `exclude`. ### Full YAML -This is the most thorough syntax, which can include graph and set operators. +This is the most thorough syntax, which can include the operator-equivalent keywords for [graph](/reference/node-selection/graph-operators) and [set](/reference/node-selection/set-operators) operators. + +Review [methods](/reference/node-selection/methods) for the available list. + + + ```yml definition: method: tag value: nightly - # Optional keywords map to the `+` and `@` operators: + # Optional keywords map to the `+` and `@` graph operators: children: true | false parents: true | false @@ -80,6 +78,53 @@ definition: indirect_selection: eager | cautious # include all tests selected indirectly? eager by default ``` + + + + +```yml +definition: + method: tag + value: nightly + + # Optional keywords map to the `+` and `@` graph operators: + + children: true | false + parents: true | false + + children_depth: 1 # if children: true, degrees to include + parents_depth: 1 # if parents: true, degrees to include + + childrens_parents: true | false # @ operator + + indirect_selection: eager | cautious | buildable # include all tests selected indirectly? eager by default +``` + + + + + +```yml +definition: + method: tag + value: nightly + + # Optional keywords map to the `+` and `@` graph operators: + + children: true | false + parents: true | false + + children_depth: 1 # if children: true, degrees to include + parents_depth: 1 # if parents: true, degrees to include + + childrens_parents: true | false # @ operator + + indirect_selection: eager | cautious | buildable | empty # include all tests selected indirectly? 
eager by default +``` + + + + The `*` operator to select all nodes can be written as: ```yml definition: @@ -113,11 +158,23 @@ Note: The `exclude` argument in YAML selectors is subtly different from the `--exclude` CLI argument. Here, `exclude` _always_ returns a [set difference](https://en.wikipedia.org/wiki/Complement_(set_theory)), and it is always applied _last_ within its scope. + + This gets us more intricate subset definitions than what's available on the CLI, where we can only pass one "yeslist" (`--select`) and one "nolist" (`--exclude`). + + + + +When more than one "yeslist" (`--select`) is passed, they are treated as a [union](/reference/node-selection/set-operators#unions) rather than an [intersection](/reference/node-selection/set-operators#intersections). Same thing when there is more than one "nolist" (`--exclude`). + + + #### Indirect selection + + As a general rule, dbt will indirectly select _all_ tests if they touch _any_ resource that you're selecting directly. We call this "eager" indirect selection. You can optionally switch the indirect selection mode to "cautious" by setting `indirect_selection` for a specific criterion: ```yml @@ -131,29 +188,70 @@ As a general rule, dbt will indirectly select _all_ tests if they touch _any_ re # if they have other unselected parents ``` -If provided, a yaml selector's `indirect_selection` value will take precedence over the CLI flag `--indirect-selection`. Because `indirect_selection` is defined separately for _each_ selection criterion, it's possible to mix eager/cautious modes within the same definition, to achieve the exact behavior that you need. Remember that you can always test out your critiera with `dbt ls --selector`. +If provided, a YAML selector's `indirect_selection` value will take precedence over the CLI flag `--indirect-selection`. 
Because `indirect_selection` is defined separately for _each_ selection criterion, it's possible to mix eager/cautious modes within the same definition, to achieve the exact behavior that you need. Remember that you can always test out your criteria with `dbt ls --selector`. -See [test selection examples](test-selection-examples) for more details about indirect selection. + -## Example + -Here are two ways to represent: +As a general rule, dbt will indirectly select _all_ tests if they touch _any_ resource that you're selecting directly. We call this "eager" indirect selection. You can optionally switch the indirect selection mode to "cautious" or "buildable" by setting `indirect_selection` for a specific criterion: - +```yml +- union: + - method: fqn + value: model_a + indirect_selection: eager # default: will include all tests that touch model_a + - method: fqn + value: model_b + indirect_selection: cautious # will not include tests touching model_b + # if they have other unselected parents + - method: fqn + value: model_c + indirect_selection: buildable # will not include tests touching model_c + # if they have other unselected parents (unless they have an ancestor that is selected) +``` - ```bash - $ dbt run --select @source:snowplow,tag:nightly models/export --exclude package:snowplow,config.materialized:incremental export_performance_timing - ``` +If provided, a YAML selector's `indirect_selection` value will take precedence over the CLI flag `--indirect-selection`. Because `indirect_selection` is defined separately for _each_ selection criterion, it's possible to mix eager/cautious/buildable modes within the same definition, to achieve the exact behavior that you need. Remember that you can always test out your criteria with `dbt ls --selector`. 
- - ```bash - $ dbt run --models @source:snowplow,tag:nightly models/export --exclude package:snowplow,config.materialized:incremental export_performance_timing - ``` + + +As a general rule, dbt will indirectly select _all_ tests if they touch _any_ resource that you're selecting directly. We call this "eager" indirect selection. You can optionally switch the indirect selection mode to "cautious", "buildable", or "empty" by setting `indirect_selection` for a specific criterion: + +```yml +- union: + - method: fqn + value: model_a + indirect_selection: eager # default: will include all tests that touch model_a + - method: fqn + value: model_b + indirect_selection: cautious # will not include tests touching model_b + # if they have other unselected parents + - method: fqn + value: model_c + indirect_selection: buildable # will not include tests touching model_c + # if they have other unselected parents (unless they have an ancestor that is selected) + - method: fqn + value: model_d + indirect_selection: empty # will include tests for only the selected node and ignore all tests attached to model_d +``` + +If provided, a YAML selector's `indirect_selection` value will take precedence over the CLI flag `--indirect-selection`. Because `indirect_selection` is defined separately for _each_ selection criterion, it's possible to mix eager/cautious/buildable/empty modes within the same definition, to achieve the exact behavior that you need. Remember that you can always test out your criteria with `dbt ls --selector`. 
+ +## Example + +Here are two ways to represent: + + + ```bash + $ dbt run --select @source:snowplow,tag:nightly models/export --exclude package:snowplow,config.materialized:incremental export_performance_timing + ``` + ```yml + selectors: - name: nightly_diet_snowplow description: "Non-incremental Snowplow models that power nightly exports" definition: + + # Optional `union` and `intersection` keywords map to the ` ` and `,` set operators: union: - intersection: - '@source:snowplow' @@ -192,6 +293,7 @@ selectors: - name: nightly_diet_snowplow description: "Non-incremental Snowplow models that power nightly exports" definition: + # Optional `union` and `intersection` keywords map to the ` ` and `,` set operators: union: - intersection: - method: source @@ -217,14 +319,15 @@ selectors: Then in our job definition: ```bash -$ dbt run --selector nightly_diet_snowplow +dbt run --selector nightly_diet_snowplow ``` ## Default -Starting in v0.21, selectors may define a boolean `default` property. If a selector has `default: true`, dbt will use this selector's criteria when tasks do not define their own selection criteria. +Selectors may define a boolean `default` property. If a selector has `default: true`, dbt will use this selector's criteria when tasks do not define their own selection criteria. Let's say we define a default selector that only selects resources defined in our root project: + ```yml selectors: - name: root_project_only @@ -238,16 +341,18 @@ selectors: ``` If I run an "unqualified" command, dbt will use the selection criteria defined in `root_project_only`—that is, dbt will only build / freshness check / generate compiled SQL for resources defined in my root project. 
+ ``` -$ dbt build -$ dbt source freshness -$ dbt docs generate +dbt build +dbt source freshness +dbt docs generate ``` If I run a command that defines its own selection criteria (via `--select`, `--exclude`, or `--selector`), dbt will ignore the default selector and use the flag criteria instead. It will not try to combine the two. -``` -$ dbt run --select model_a -$ dbt run --exclude model_a + +```bash +dbt run --select "model_a" +dbt run --exclude model_a ``` Only one selector may set `default: true` for a given invocation; otherwise, dbt will return an error. You may use a Jinja expression to adjust the value of `default` depending on the environment, however: @@ -288,4 +393,9 @@ selectors: value: buzz ``` +**Note:** While selector inheritance allows the logic from another selector to be _reused_, it doesn't allow the logic from that selector to be _modified_ by means of `parents`, `children`, `indirect_selection`, and so on. + +The `selector` method returns the complete set of nodes returned by the named selector. + + diff --git a/website/docs/reference/parsing.md b/website/docs/reference/parsing.md index 256c01dd094..1a68ba0d476 100644 --- a/website/docs/reference/parsing.md +++ b/website/docs/reference/parsing.md @@ -1,11 +1,12 @@ --- title: "Project Parsing" +description: "Read this guide to understand the project parsing configuration in dbt." --- ## Related documentation -- The `dbt parse` [command](parse) -- Partial parsing [profile config](profiles.yml#partial_parse) and [CLI flags](global-cli-flags#partial-parsing) -- Experimental parser [CLI flag](global-cli-flags#experimental-parser) +- The `dbt parse` [command](/reference/commands/parse) +- Partial parsing [profile config](/docs/core/connect-data-platform/profiles.yml#partial_parse) and [CLI flags](/reference/global-cli-flags#partial-parsing) +- Experimental parser [CLI flag](/reference/global-cli-flags#experimental-parser) ## What is parsing? 
@@ -15,7 +16,7 @@ Parsing projects can be slow, especially as projects get bigger—hundreds of mo - LibYAML bindings for PyYAML - Partial parsing, which avoids re-parsing unchanged files between invocations - An experimental parser, which extracts information from simple models much more quickly -- [RPC server](rpc), which keeps a manifest in memory, and re-parses the project at server startup/hangup +- [RPC server](/reference/commands/rpc), which keeps a manifest in memory, and re-parses the project at server startup/hangup These optimizations can be used in combination to reduce parse time from minutes to seconds. At the same time, each has some known limitations, so they are disabled by default. @@ -34,15 +35,15 @@ After parsing your project, dbt stores an internal project manifest in a file ca Starting in v1.0, partial parsing is **on** by default. In development, partial parsing can significantly reduce the time spent waiting at the start of a run, which translates to faster dev cycles and iteration. -The [`PARTIAL_PARSE` global config](global-configs#partial-parsing) can be enabled or disabled via `profiles.yml`, environment variable, or CLI flag. +The [`PARTIAL_PARSE` global config](/reference/global-configs/parsing) can be enabled or disabled via `profiles.yml`, environment variable, or CLI flag. ### Known limitations Parse-time attributes (dependencies, configs, and resource properties) are resolved using the parse-time context. When partial parsing is enabled, and certain context variables change, those attributes will _not_ be re-resolved, and are likely to become stale. -In particular, you may see **incorrect results** if these attributes depend on "volatile" context variables, such as [`run_started_at`](run_started_at), [`invocation_id`](invocation_id), or [flags](flags). These variables are likely (or even guaranteed!) to change in each invocation. 
We _highly discourage_ you from using these variables to set parse-time attributes (dependencies, configs, and resource properties). +In particular, you may see **incorrect results** if these attributes depend on "volatile" context variables, such as [`run_started_at`](/reference/dbt-jinja-functions/run_started_at), [`invocation_id`](/reference/dbt-jinja-functions/invocation_id), or [flags](/reference/dbt-jinja-functions/flags). These variables are likely (or even guaranteed!) to change in each invocation. We _highly discourage_ you from using these variables to set parse-time attributes (dependencies, configs, and resource properties). -Starting in v1.0, dbt _will_ detect changes in environment variables. It will selectively re-parse only the files that depend on that [`env_var`](env_var) value. (If the env var is used in `profiles.yml` or `dbt_project.yml`, a full re-parse is needed.) However, dbt will _not_ re-render **descriptions** that include env vars. If your descriptions include frequently changing env vars (this is highly uncommon), we recommend that you fully re-parse when generating documentation: `dbt --no-partial-parse docs generate`. +Starting in v1.0, dbt _will_ detect changes in environment variables. It will selectively re-parse only the files that depend on that [`env_var`](/reference/dbt-jinja-functions/env_var) value. (If the env var is used in `profiles.yml` or `dbt_project.yml`, a full re-parse is needed.) However, dbt will _not_ re-render **descriptions** that include env vars. If your descriptions include frequently changing env vars (this is highly uncommon), we recommend that you fully re-parse when generating documentation: `dbt --no-partial-parse docs generate`. If certain inputs change between runs, dbt will trigger a full re-parse. The results will be correct, but the full re-parse may be quite slow. Today those inputs are: - `--vars` @@ -50,7 +51,7 @@ If certain inputs change between runs, dbt will trigger a full re-parse. 
The res - `dbt_project.yml` content (or `env_var` values used within) - installed packages - dbt version -- certain widely-used macros, e.g. [builtins](builtins) overrides or `generate_x_name` for `database`/`schema`/`alias` +- certain widely-used macros, e.g. [builtins](/reference/dbt-jinja-functions/builtins) overrides or `generate_x_name` for `database`/`schema`/`alias` If you ever get into a bad state, you can disable partial parsing and trigger a full re-parse by setting the `PARTIAL_PARSE` global config to false, or by deleting `target/partial_parse.msgpack` (e.g. by running `dbt clean`). @@ -58,10 +59,10 @@ If you ever get into a bad state, you can disable partial parsing and trigger a At parse time, dbt needs to extract the contents of `ref()`, `source()`, and `config()` from all models in the project. Traditionally, dbt has extracted those values by rendering the Jinja in every model file, which can be slow. In v0.20, we introduced a new way to statically analyze model files, leveraging [`tree-sitter`](https://github.com/tree-sitter/tree-sitter), which we're calling an "experimental parser". You can see the code for an initial Jinja2 grammar [here](https://github.com/dbt-labs/tree-sitter-jinja2). -Starting in v1.0, the experimental parser is **on** by default. We believe it can offer *some* speedup to 95% of projects. You may optionally turn it off using the [`STATIC_PARSER` global config](global-configs#static-parser). +Starting in v1.0, the experimental parser is **on** by default. We believe it can offer *some* speedup to 95% of projects. You may optionally turn it off using the [`STATIC_PARSER` global config](/reference/global-configs/parsing). For now, the static parser only works with models, and models whose Jinja is limited to those three special macros (`ref`, `source`, `config`). The experimental parser is at least 3x faster than a full Jinja render. 
Based on testing with data from dbt Cloud, we believe the current grammar can statically parse 60% of models in the wild. So for the average project, we'd hope to see a 40% speedup in the model parser. ## Experimental parser -We plan to make iterative improvements to static parsing in future versions, and to use random sampling (via anonymous usage tracking) to verify that it yields correct results. You can opt into the latest "experimental" version of the static parser using the [`USE_EXPERIMENTAL_PARSER` global config](global-configs#experimental-parser). +We plan to make iterative improvements to static parsing in future versions, and to use random sampling (via anonymous usage tracking) to verify that it yields correct results. You can opt into the latest "experimental" version of the static parser using the [`USE_EXPERIMENTAL_PARSER` global config](/reference/global-configs/parsing). diff --git a/website/docs/reference/profiles.yml.md b/website/docs/reference/profiles.yml.md deleted file mode 100644 index 96f038f6782..00000000000 --- a/website/docs/reference/profiles.yml.md +++ /dev/null @@ -1,51 +0,0 @@ ---- - ---- - -If you're using the dbt CLI, you'll need to set up a `profiles.yml` file. - -You can learn more about this in the article on [Connecting to your warehouse](/docs/get-started/connection-profiles). - -This article lists the parts of your `profiles.yml` which are _not_ database specific. Check out the article for your database for exact connection details. 
- - - -```yml -[config](global-configs): - [send_anonymous_usage_stats](global-configs#send-anonymous-usage-stats): - [use_colors](global-configs#use-colors): - [partial_parse](global-configs#partial-parsing): - [printer_width](global-configs#printer-width): - [write_json](global-configs#writing-json-artifacts): - [warn_error](global-configs#warnings-as-errors): - [log_format](global-configs#log-formatting): - [debug](global-configs#debug-level-logging): - [version_check](global-configs#checking-version-compatibility): - [fail_fast](global-configs#failing-fast): - [use_experimental_parser](global-configs#experimental-parser): - [static_parser](global-configs#static-parser): - -: - target: # this is the default target - outputs: - : - type: - schema: - threads: - - ### database-specific connection details - ... - - : # additional targets - ... - -: # additional profiles - ... - -``` - - - -## User config - -You can set default values of global configs for all projects that you run using your local machine. See the docs on [global configs](global-configs) for details. diff --git a/website/docs/reference/programmatic-invocations.md b/website/docs/reference/programmatic-invocations.md new file mode 100644 index 00000000000..6afcd65c1bc --- /dev/null +++ b/website/docs/reference/programmatic-invocations.md @@ -0,0 +1,105 @@ +--- +title: "Programmatic invocations" +--- + +In v1.5, dbt-core added support for programmatic invocations. The intent is to expose the existing dbt CLI via a Python entry point, such that top-level commands are callable from within a Python script or application. + +The entry point is a `dbtRunner` class, which allows you to `invoke` the same commands as on the CLI. 
+ +```python +from dbt.cli.main import dbtRunner, dbtRunnerResult + +# initialize +dbt = dbtRunner() + +# create CLI args as a list of strings +cli_args = ["run", "--select", "tag:my_tag"] + +# run the command +res: dbtRunnerResult = dbt.invoke(cli_args) + +# inspect the results +for r in res.result: + print(f"{r.node.name}: {r.status}") +``` + +## `dbtRunnerResult` + +Each command returns a `dbtRunnerResult` object, which has three attributes: +- `success` (bool): Whether the command succeeded. +- `result`: If the command completed (successfully or with handled errors), its result(s). Return type varies by command. +- `exception`: If the dbt invocation encountered an unhandled error and did not complete, the exception it encountered. + +There is a 1:1 correspondence between [CLI exit codes](/reference/exit-codes) and the `dbtRunnerResult` returned by a programmatic invocation: + +| Scenario | CLI Exit Code | `success` | `result` | `exception` | +|---------------------------------------------------------------------------------------------|--------------:|-----------|-------------------|-------------| +| Invocation completed without error | 0 | `True` | varies by command | `None` | +| Invocation completed with at least one handled error (e.g. test failure, model build error) | 1 | `False` | varies by command | `None` | +| Unhandled error. Invocation did not complete, and returns no results. | 2 | `False` | `None` | Exception | + +## Commitments & Caveats + +From dbt Core v1.5 onward, we are making an ongoing commitment to providing a Python entry point at functional parity with dbt-core's CLI. We reserve the right to change the underlying implementation used to achieve that goal. We expect that the current implementation will unlock real use cases, in the short & medium term, while we work on a set of stable, long-term interfaces that will ultimately replace it.
+ +In particular, the objects returned by each command in `dbtRunnerResult.result` are not fully contracted, and therefore liable to change. Some of the returned objects are partially documented, because they overlap in part with the contents of [dbt artifacts](/reference/artifacts/dbt-artifacts). As Python objects, they contain many more fields and methods than what's available in the serialized JSON artifacts. These additional fields and methods should be considered **internal and liable to change in future versions of dbt-core.** + +## Advanced usage patterns + +:::caution +The syntax and support for these patterns are liable to change in future versions of `dbt-core`. +::: + +The goal of `dbtRunner` is to offer parity with CLI workflows, within a programmatic environment. There are a few advanced usage patterns that extend what's possible with the CLI. + +### Reusing objects + +Pass pre-constructed objects into `dbtRunner`, to avoid recreating those objects by reading files from disk. Currently, the only object supported is the `Manifest` (project contents). + +```python +from dbt.cli.main import dbtRunner, dbtRunnerResult +from dbt.contracts.graph.manifest import Manifest + +# use 'parse' command to load a Manifest +res: dbtRunnerResult = dbtRunner().invoke(["parse"]) +manifest: Manifest = res.result + +# introspect manifest +# e.g. assert every public model has a description +for node in manifest.nodes.values(): + if node.resource_type == "model" and node.access == "public": + assert node.description != "", f"{node.name} is missing a description" + +# reuse this manifest in subsequent commands to skip parsing +dbt = dbtRunner(manifest=manifest) +cli_args = ["run", "--select", "tag:my_tag"] +res = dbt.invoke(cli_args) +``` + +### Registering callbacks + +Register `callbacks` on dbt's `EventManager`, to access structured events and enable custom logging. 
The current behavior of callbacks is to block subsequent steps from proceeding; this functionality is not guaranteed in future versions. + +```python +from dbt.cli.main import dbtRunner +from dbt.events.base_types import EventMsg + +def print_version_callback(event: EventMsg): + if event.info.name == "MainReportVersion": + print(f"We are thrilled to be running dbt{event.data.version}") + +dbt = dbtRunner(callbacks=[print_version_callback]) +dbt.invoke(["list"]) +``` + +### Overriding parameters + +Pass in parameters as keyword arguments, instead of a list of CLI-style strings. At present, dbt will not do any validation or type coercion on your inputs. The subcommand must be specified, in a list, as the first positional argument. +```python +from dbt.cli.main import dbtRunner +dbt = dbtRunner() + +# these are equivalent +dbt.invoke(["--fail-fast", "run", "--select", "tag:my_tag"]) +dbt.invoke(["run"], select=["tag:my_tag"], fail_fast=True) +``` diff --git a/website/docs/reference/project-configs/analysis-paths.md b/website/docs/reference/project-configs/analysis-paths.md index 0597a22c2c5..5c3d223a5cb 100644 --- a/website/docs/reference/project-configs/analysis-paths.md +++ b/website/docs/reference/project-configs/analysis-paths.md @@ -1,5 +1,6 @@ --- datatype: [directorypath] +description: "Read this guide to understand the analysis-paths configuration in dbt." default_value: [] --- @@ -12,16 +13,16 @@ analysis-paths: [directorypath] ## Definition -Specify a custom list of directories where [analyses](analyses) are located. +Specify a custom list of directories where [analyses](/docs/build/analyses) are located. ## Default Without specifying this config, dbt will not compile any `.sql` files as analyses. 
-However, the [`dbt init` command](init) populates this value as `analyses` ([source](https://github.com/dbt-labs/dbt-starter-project/blob/HEAD/dbt_project.yml#L15)) +However, the [`dbt init` command](/reference/commands/init) populates this value as `analyses` ([source](https://github.com/dbt-labs/dbt-starter-project/blob/HEAD/dbt_project.yml#L15)) ## Examples ### Use a subdirectory named `analyses` -This is the value populated by the [`dbt init` command](init). +This is the value populated by the [`dbt init` command](/reference/commands/init). diff --git a/website/docs/reference/project-configs/asset-paths.md b/website/docs/reference/project-configs/asset-paths.md index 401a88d86c7..1fb3cf9f260 100644 --- a/website/docs/reference/project-configs/asset-paths.md +++ b/website/docs/reference/project-configs/asset-paths.md @@ -1,5 +1,6 @@ --- datatype: [directorypath] +description: "Read this guide to understand the asset-paths configuration in dbt." default_value: [] --- @@ -14,14 +15,8 @@ asset-paths: [directorypath] ## Definition Optionally specify a custom list of directories to copy to the `target` directory as part of the `docs generate` command. This is useful for rendering images in your repository in your project documentation. - - -* `v0.18.0`: This configuration was introduced — see the [migration guide](/guides/migration/versions) for more details. - - - ## Default -By default, dbt will not compile any additional files as part of `docs generate`, i.e. `analysis-paths: []` +By default, dbt will not copy any additional files as part of docs generate, i.e. 
`asset-paths: []` ## Examples ### Compile files in the `assets` subdirectory as part of `docs generate` diff --git a/website/docs/reference/project-configs/clean-targets.md b/website/docs/reference/project-configs/clean-targets.md index a8b9e2f8f14..9b464840723 100644 --- a/website/docs/reference/project-configs/clean-targets.md +++ b/website/docs/reference/project-configs/clean-targets.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [target_path] --- - - -- **v1.0.0:** The `modules-path` has been updated to be [`packages-install-path`](packages-install-path). The default value has also been updated to be `dbt-packages` from `dbt-modules`. - - - ```yml @@ -19,17 +13,17 @@ clean-targets: [directorypath] ## Definition -Optionally specify a custom list of directories to be removed by the `dbt clean` [command](clean). As such, you should only include directories containing artifacts (e.g. compiled files, logs, installed packages) in this list. +Optionally specify a custom list of directories to be removed by the `dbt clean` [command](/reference/commands/clean). As such, you should only include directories containing artifacts (e.g. compiled files, logs, installed packages) in this list. ## Default -If this configuration is not included in your `dbt_project.yml` file, the `clean` command will remove files in your [target-path](target-path). +If this configuration is not included in your `dbt_project.yml` file, the `clean` command will remove files in your [target-path](/reference/project-configs/target-path). ## Examples ### Remove packages and compiled files as part of `dbt clean` :::info -This is our preferred configuration +This is our preferred configuration, but is not the default. ::: -To remove packages as well as compiled files, include the value of your [packages-install-path](packages-install-path) configuration in your `clean-targets` configuration. 
+To remove packages as well as compiled files, include the value of your [packages-install-path](/reference/project-configs/packages-install-path) configuration in your `clean-targets` configuration. diff --git a/website/docs/reference/project-configs/config-version.md b/website/docs/reference/project-configs/config-version.md index 60e787513a5..804caf1328f 100644 --- a/website/docs/reference/project-configs/config-version.md +++ b/website/docs/reference/project-configs/config-version.md @@ -1,7 +1,14 @@ --- datatype: integer +description: "Read this guide to understand the config-version configuration in dbt." --- + + +Starting in dbt v1.5, the `config-version:` tag is optional. + + + ```yml @@ -13,11 +20,7 @@ config-version: 2 ## Definition Specify your `dbt_project.yml` as using the v2 structure. - - -* `v0.17.0`: This configuration was introduced — see the [migration guide](/guides/migration/versions) for more details. - - + This configuration is optional. ## Default Without this configuration, dbt will assume your `dbt_project.yml` uses the version 1 syntax, which was deprecated in dbt v0.19.0. diff --git a/website/docs/reference/project-configs/dispatch-config.md b/website/docs/reference/project-configs/dispatch-config.md index 6cb8060949c..a6656be13a4 100644 --- a/website/docs/reference/project-configs/dispatch-config.md +++ b/website/docs/reference/project-configs/dispatch-config.md @@ -1,5 +1,6 @@ --- title: dispatch (config) +description: "Read this guide to understand the dispatch configuration in dbt." datatype: list required: False --- @@ -18,7 +19,7 @@ dispatch: ## Definition -Optionally override the [dispatch](dispatch) search locations for macros in certain namespaces. If not specified, `dispatch` will look in your root project _first_, by default, and then look for implementations in the package named by `macro_namespace`. +Optionally override the [dispatch](/reference/dbt-jinja-functions/dispatch) search locations for macros in certain namespaces. 
If not specified, `dispatch` will look in your root project _first_, by default, and then look for implementations in the package named by `macro_namespace`. ## Examples @@ -37,7 +38,7 @@ dispatch: I've reimplemented certain macros from the `dbt_utils` package in my root project (`'my_root_project'`), and I want my versions to take precedence. Otherwise, fall back to the versions in `dbt_utils`. -_Note: As of v0.21.1, this is the default behavior. You may optionally choose to express that search order explicitly as:_ +_Note: This is the default behavior. You may optionally choose to express that search order explicitly as:_ diff --git a/website/docs/reference/project-configs/docs-paths.md b/website/docs/reference/project-configs/docs-paths.md index 8bb681596bd..2aee7b31ee7 100644 --- a/website/docs/reference/project-configs/docs-paths.md +++ b/website/docs/reference/project-configs/docs-paths.md @@ -1,5 +1,6 @@ --- datatype: [directorypath] +description: "Read this guide to understand the docs-paths configuration in dbt." default_value: [] --- @@ -12,11 +13,11 @@ docs-paths: [directorypath] ## Definition -Optionally specify a custom list of directories where [docs blocks](documentation#docs-blocks) are located. +Optionally specify a custom list of directories where [docs blocks](/docs/collaborate/documentation#docs-blocks) are located. ## Default -By default, dbt will search in all resource paths for docs blocks (i.e. the combined list of [model-paths](model-paths), [seed-paths](seed-paths), [analysis-paths](analysis-paths), [macro-paths](macro-paths) and [snapshot-paths](snapshot-paths)). If this option is configured, dbt will _only_ look in the specified directory for docs blocks. +By default, dbt will search in all resource paths for docs blocks (i.e. 
the combined list of [model-paths](/reference/project-configs/model-paths), [seed-paths](/reference/project-configs/seed-paths), [analysis-paths](/reference/project-configs/analysis-paths), [macro-paths](/reference/project-configs/macro-paths) and [snapshot-paths](/reference/project-configs/snapshot-paths)). If this option is configured, dbt will _only_ look in the specified directory for docs blocks. ## Examples diff --git a/website/docs/reference/project-configs/log-path.md b/website/docs/reference/project-configs/log-path.md index 64b5d73ce85..29cad35d120 100644 --- a/website/docs/reference/project-configs/log-path.md +++ b/website/docs/reference/project-configs/log-path.md @@ -1,5 +1,6 @@ --- datatype: directorypath +description: "Read this guide to understand the log-path configuration in dbt." default_value: logs --- @@ -17,24 +18,50 @@ Optionally specify a custom directory where dbt will write logs. By default, dbt will write to the `logs` directory, i.e. `log-path: logs` + ## Configuration -In the manner of a ["global" config](global-configs), the log path can be set in three places: +In the manner of a ["global" config](/reference/global-configs/about-global-configs), the log path can be set in three places: 1. `--log-path` CLI flag 2. `DBT_LOG_PATH` environment variable 3. `log-path` in `dbt_project.yml` + + +:::warning Feature deprecation + +As of dbt version 1.5, setting the `log-path` in the `dbt_project.yml` is deprecated. Backward compatibility is still supported in 1.5 but will be removed in a future update. Migrate to the CLI flag or environment variable methods to avoid potential errors or disruptions. 
+ +::: + +The precedence order is: CLI flag > env var > `dbt_project.yml(deprecated)` + + + + + The precedence order is: CLI flag > env var > `dbt_project.yml` + + ## Examples -### Write logs to a subdirectory named `dbt_logs` instead of `logs` +### Specify subdirectory using the project config file ```yml log-path: dbt_logs ``` - + + + +### Specify subdirectory from the command line + +```bash +dbt --log-path dbt_logs run +``` + + diff --git a/website/docs/reference/project-configs/macro-paths.md b/website/docs/reference/project-configs/macro-paths.md index 322a47a0838..486ec08ffdf 100644 --- a/website/docs/reference/project-configs/macro-paths.md +++ b/website/docs/reference/project-configs/macro-paths.md @@ -1,5 +1,6 @@ --- datatype: directorypath +description: "Read this guide to understand the macro-paths configuration in dbt." default_value: [macros] --- @@ -12,7 +13,7 @@ macro-paths: [directorypath] ## Definition -Optionally specify a custom list of directories where [macros](jinja-macros#macros) are located. Note that you cannot co-locate models and macros. +Optionally specify a custom list of directories where [macros](/docs/build/jinja-macros#macros) are located. Note that you cannot co-locate models and macros. ## Default By default, dbt will search for macros in a directory named `macros`, i.e. `macro-paths: ["macros"]` diff --git a/website/docs/reference/project-configs/model-paths.md b/website/docs/reference/project-configs/model-paths.md index 2129747af27..a0652432787 100644 --- a/website/docs/reference/project-configs/model-paths.md +++ b/website/docs/reference/project-configs/model-paths.md @@ -2,11 +2,6 @@ datatype: [directorypath] default_value: [models] --- - - -- **v1.0.0:** The config `source-paths` has been deprecated in favor of `model-paths`. 
- - diff --git a/website/docs/reference/project-configs/name.md b/website/docs/reference/project-configs/name.md index fd8e8a089c0..2d5d425a254 100644 --- a/website/docs/reference/project-configs/name.md +++ b/website/docs/reference/project-configs/name.md @@ -1,5 +1,6 @@ --- datatype: string +description: "Read this guide to understand the name configuration in dbt." required: True --- diff --git a/website/docs/reference/project-configs/on-run-start-on-run-end.md b/website/docs/reference/project-configs/on-run-start-on-run-end.md index 091122baa88..e1a3d7b761a 100644 --- a/website/docs/reference/project-configs/on-run-start-on-run-end.md +++ b/website/docs/reference/project-configs/on-run-start-on-run-end.md @@ -1,8 +1,11 @@ --- title: on-run-start & on-run-end +description: "Read this guide to understand the on-run-start and on-run-end configurations in dbt." datatype: sql-statement | [sql-statement] --- +import OnRunCommands from '/snippets/_onrunstart-onrunend-commands.md'; + ```yml @@ -14,54 +17,20 @@ on-run-end: sql-statement | [sql-statement] ## Definition -A SQL statement (or list of SQL statements) to be run at the start, or end, of the following commands: -- `dbt run` -- `dbt test` -- `dbt seed` -- `dbt snapshot` -- `dbt build` -- `dbt compile` -- `dbt docs generate` + +A SQL statement (or list of SQL statements) to be run at the start or end of the following commands: `on-run-start` and `on-run-end` hooks can also call macros that return SQL statements ## Usage notes -* The `on-run-end` hook has additional jinja variables available in the context — check out the [docs](on-run-end-context). +* The `on-run-end` hook has additional jinja variables available in the context — check out the [docs](/reference/dbt-jinja-functions/on-run-end-context). 
## Examples - - - - -### Grant privileges at the end of a run - - - -```yml -on-run-end: "grant select on all tables in schema {{ target.schema }} group transformer" - -``` - - - -### Grant multiple privileges at the end of a run - - - -```yml -on-run-end: - - "grant usage on schema {{ target.schema }} to group reporter" - - "grant select on all tables in schema {{ target.schema }} group reporter" - -``` - - - - + ### Grant privileges on all schemas that dbt uses at the end of a run -This leverages the [schemas](schemas) variable that is only available in an `on-run-end` hook. +This leverages the [schemas](/reference/dbt-jinja-functions/schemas) variable that is only available in an `on-run-end` hook. @@ -85,4 +54,4 @@ on-run-end: "{{ grant_select(schemas) }}" ### Additional examples -We've compiled some more in-depth examples [here](hooks-operations#additional-examples). +We've compiled some more in-depth examples [here](/docs/build/hooks-operations#additional-examples). diff --git a/website/docs/reference/project-configs/packages-install-path.md b/website/docs/reference/project-configs/packages-install-path.md index 3ab1869ad48..157c630fd36 100644 --- a/website/docs/reference/project-configs/packages-install-path.md +++ b/website/docs/reference/project-configs/packages-install-path.md @@ -3,12 +3,6 @@ datatype: directorypath default_value: dbt_packages --- - - -- **v1.0.0:** The default config has changed from `modules-path` to `packages-install-path` with a new default value of `dbt_packages`. - - - ```yml @@ -18,7 +12,7 @@ packages-install-path: directorypath ## Definition -Optionally specify a custom directory where [packages](/docs/build/packages) are installed when you run the `dbt deps` [command](deps). Note that this directory is usually git-ignored. +Optionally specify a custom directory where [packages](/docs/build/packages) are installed when you run the `dbt deps` [command](/reference/commands/deps). Note that this directory is usually git-ignored. 
## Default By default, dbt will install packages in the `dbt_packages` directory, i.e. `packages-install-path: dbt_packages` @@ -29,7 +23,7 @@ By default, dbt will install packages in the `dbt_packages` directory, i.e. `pac ```yml -module-path: packages +packages-install-path: packages ``` diff --git a/website/docs/reference/project-configs/profile.md b/website/docs/reference/project-configs/profile.md index 9ea4cd55d63..39be0cea109 100644 --- a/website/docs/reference/project-configs/profile.md +++ b/website/docs/reference/project-configs/profile.md @@ -1,5 +1,6 @@ --- datatype: string +description: "Read this guide to understand the profile configuration in dbt." --- @@ -11,18 +12,18 @@ profile: string ## Definition The profile your dbt project should use to connect to your data warehouse. -* If you are developing in dbt Cloud: This configuration is optional +* If you are developing in dbt Cloud: This configuration is not applicable * If you are developing locally: This configuration is required, unless a command-line option (i.e. `--profile`) is supplied. ## Related guides -* [Connecting to your warehouse using the command line](/docs/get-started/connection-profiles#connecting-to-your-warehouse-using-the-command-line) +* [Connecting to your warehouse using the command line](/docs/core/connect-data-platform/connection-profiles#connecting-to-your-warehouse-using-the-command-line) ## Recommendation Often an organization has only one data warehouse, so it is sensible to use your organization's name as a profile name, in `snake_case`. For example: * `profile: acme` * `profile: jaffle_shop` -It is also reasonable to include the name of your warehouse technology in your profile name, partiuclarly if you have multiple warehouses. For example: +It is also reasonable to include the name of your warehouse technology in your profile name, particularly if you have multiple warehouses.
For example: * `profile: acme_snowflake` * `profile: jaffle_shop_bigquery` * `profile: jaffle_shop_redshift` diff --git a/website/docs/reference/project-configs/query-comment.md b/website/docs/reference/project-configs/query-comment.md index 9b9eae99a31..b1a73605e55 100644 --- a/website/docs/reference/project-configs/query-comment.md +++ b/website/docs/reference/project-configs/query-comment.md @@ -30,14 +30,6 @@ A string to inject as a comment in each query that dbt runs against your databas The `query-comment` configuration can also call a macro that returns a string. - - -* `v0.15.0`: The `query-comment` configuration was introduced -* `v0.16.1`: Dictionary syntax introduced to allow comments to be appended -* `v0.20.0:` Introduced `job-label` argument for BigQuery job labels - - - ## Default By default, dbt will insert a comment at the top of your query containing the information including the dbt version, profile and target names, and node ids for the resources it runs. For example: @@ -149,13 +141,6 @@ select ... ### BigQuery: include query comment items as job labels - - - -* `v0.20.0:` Introduced `job-label` argument for BigQuery job labels - - - If `query-comment.job-label` is set to true, dbt will include the query comment items, if a dictionary, or the comment string, as job labels on the query it executes. These will be included in addition to labels specified in the [BigQuery-specific config](/reference/project-configs/query-comment#bigquery-include-query-comment-items-as-job-labels). @@ -209,7 +194,7 @@ The `query-comment` config can reference macros in your dbt project. Simply crea -Then call the macro in your `dbt_project.yml` file. Make sure you quote the macro to avoid the yaml parser from trying to interpret the `{` as the start of a dictionary. +Then call the macro in your `dbt_project.yml` file. Make sure you quote the macro to prevent the YAML parser from trying to interpret the `{` as the start of a dictionary.
@@ -276,14 +261,14 @@ The following context variables are available when generating a query comment: | Context Variable | Description | | ---------------- | ----------- | | dbt_version | The version of dbt being used | -| env_var | See [env_var](env_var) | -| modules | See [modules](modules) | +| env_var | See [env_var](/reference/dbt-jinja-functions/env_var) | +| modules | See [modules](/reference/dbt-jinja-functions/modules) | | run_started_at | When the dbt invocation began | | invocation_id | A unique ID for the dbt invocation | -| fromjson | See [fromjson](fromjson) | -| tojson | See [tojson](tojson) | -| log | See [log](log) | -| var | See [var](var) | -| target | See [target](target) | +| fromjson | See [fromjson](/reference/dbt-jinja-functions/fromjson) | +| tojson | See [tojson](/reference/dbt-jinja-functions/tojson) | +| log | See [log](/reference/dbt-jinja-functions/log) | +| var | See [var](/reference/dbt-jinja-functions/var) | +| target | See [target](/reference/dbt-jinja-functions/target) | | connection_name | A string representing the internal name for the connection. This string is generated by dbt. | | node | A dictionary representation of the parsed node object. Use `node.unique_id`, `node.database`, `node.schema`, etc | diff --git a/website/docs/reference/project-configs/quoting.md b/website/docs/reference/project-configs/quoting.md index 5c89183ddc8..821b920188c 100644 --- a/website/docs/reference/project-configs/quoting.md +++ b/website/docs/reference/project-configs/quoting.md @@ -1,5 +1,8 @@ --- +title: "Configuring quoting in projects" +sidebar_label: "quoting" datatype: boolean # -ish, it's actually a dictionary of bools +description: "Read this guide to understand the quoting configuration in dbt." default: true --- @@ -25,13 +28,6 @@ Note that for BigQuery quoting configuration, `database` and `schema` should be ::: - - -* `v0.10.1`: This configuration was introduced with a default value of `true` for each adapter. 
-* `v0.11.0`: The default quoting config on Snowflake changed from `true` to `false` - - - ## Default The default values vary by database. @@ -109,6 +105,10 @@ create table analytics.dbt_alice.dim_customers ### Snowflake Set all quoting configs to `False`. This means that you cannot use reserved words as identifiers, however it's usually a good idea to avoid these reserved words anyway. +If a Snowflake source table uses a quoted database, schema, or table identifier, you can configure it in the source.yml file. [Refer to configuring quoting for more info](/reference/resource-properties/quoting). + + + #### Explanation: Whereas most databases will _lowercase_ unquoted identifiers, Snowflake will _uppercase_ unquoted identifiers. If a model name is lowercased _and quoted_, then it cannot be referred to without quotes! Check out the example below for more information. diff --git a/website/docs/reference/project-configs/require-dbt-version.md b/website/docs/reference/project-configs/require-dbt-version.md index ccea51f833a..85a502bff60 100644 --- a/website/docs/reference/project-configs/require-dbt-version.md +++ b/website/docs/reference/project-configs/require-dbt-version.md @@ -1,5 +1,6 @@ --- datatype: version-range | [version-range] +description: "Read this guide to understand the require-dbt-version configuration in dbt." default_value: None --- @@ -18,15 +19,9 @@ When you set this configuration, dbt sends a helpful error message for any user If this configuration is not specified, no version check will occur. - - -* `v0.13.0`: This configuration was introduced - - - :::info YAML Quoting -This configuration needs to be interpolated by the yaml parser as a string. As such, you should quote the value of the configuration, taking care to avoid whitespace. For example: +This configuration needs to be interpolated by the YAML parser as a string. As such, you should quote the value of the configuration, taking care to avoid whitespace. 
For example: ```yml # ✅ These will work require-dbt-version: ">=1.0.0" # Double quotes are OK @@ -117,7 +112,7 @@ Running with dbt=0.21.0 Found 13 models, 2 tests, 1 archives, 0 analyses, 204 macros, 2 operations.... ``` -See [global configs](global-configs#checking-version-compatibility) for usage details. +See [global configs](/reference/global-configs/version-compatibility) for usage details. ## Recommendation * This is a recommended configuration diff --git a/website/docs/reference/project-configs/seed-paths.md b/website/docs/reference/project-configs/seed-paths.md index 92f7c5aa91f..614bda62cd2 100644 --- a/website/docs/reference/project-configs/seed-paths.md +++ b/website/docs/reference/project-configs/seed-paths.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [data] --- - - -- **v1.0.0:** The config `data-paths` has been deprecated in favor of `seed-paths`. - - - ```yml diff --git a/website/docs/reference/project-configs/snapshot-paths.md b/website/docs/reference/project-configs/snapshot-paths.md index 6aa216cc8fd..81b2759609d 100644 --- a/website/docs/reference/project-configs/snapshot-paths.md +++ b/website/docs/reference/project-configs/snapshot-paths.md @@ -1,5 +1,6 @@ --- datatype: [directorypath] +description: "Read this guide to understand the snapshot-paths configuration in dbt." default_value: [snapshots] --- @@ -11,13 +12,7 @@ snapshot-paths: [directorypath] ## Definition -Optionally specify a custom list of directories where [snapshots](snapshots) are located. Note that you cannot co-locate models and snapshots. - - - -* `v0.14.0`: Snapshots were introduced - - +Optionally specify a custom list of directories where [snapshots](/docs/build/snapshots) are located. Note that you cannot co-locate models and snapshots. ## Default By default, dbt will search for snapshots in the `snapshots` directory, i.e. 
`snapshot-paths: ["snapshots"]` diff --git a/website/docs/reference/project-configs/target-path.md b/website/docs/reference/project-configs/target-path.md index 82956d9688d..fddc5a93c5e 100644 --- a/website/docs/reference/project-configs/target-path.md +++ b/website/docs/reference/project-configs/target-path.md @@ -5,7 +5,7 @@ default_value: target ```yml -target-path: [directorypath] +target-path: directorypath ``` @@ -14,28 +14,56 @@ target-path: [directorypath] Optionally specify a custom directory where compiled files (e.g. compiled models and tests) will be written when you run the `dbt run`, `dbt compile`, or `dbt test` command. + ## Default By default, dbt will write compiled files to the `target` directory, i.e. `target-path: target` + ## Configuration -In the manner of a ["global" config](global-configs), the target path can be set in three places: +In the manner of a ["global" config](/reference/global-configs/about-global-configs), the target path can be set in three places: 1. `--target-path` CLI flag 2. `DBT_TARGET_PATH` environment variable 3. `target-path` in `dbt_project.yml` + + +:::warning Feature deprecation + +As of dbt version 1.5, setting the `target-path` in the `dbt_project.yml` is deprecated. Backward compatibility is still supported in 1.5 but will be removed in a future update. Migrate to the CLI flag or environment variable methods to avoid potential errors or disruptions. 
+ +::: + +The precedence order is: CLI flag > env var > `dbt_project.yml` (deprecated) + + + + + The precedence order is: CLI flag > env var > `dbt_project.yml` + + ## Examples -### Use a subdirectory named `compiled` for compiled files +### Specify subdirectory using the project config file ```yml -target-path: "compiled" +target-path: "compiled_files" ``` + + + +### Specify subdirectory from the command line + +```bash +dbt run --target-path compiled_files +``` + + \ No newline at end of file diff --git a/website/docs/reference/project-configs/test-paths.md b/website/docs/reference/project-configs/test-paths.md index eec360663d9..e3d0e0b76fa 100644 --- a/website/docs/reference/project-configs/test-paths.md +++ b/website/docs/reference/project-configs/test-paths.md @@ -3,12 +3,6 @@ datatype: [directorypath] default_value: [test] --- - - -* `v1.0.0`: Generic tests can be defined in the `tests/generic` subfolder, in addition to the `macros/` directory - - - ```yml @@ -18,7 +12,9 @@ test-paths: [directorypath] ## Definition -Optionally specify a custom list of directories where [data tests](/docs/build/tests) are located. + +Optionally specify a custom list of directories where [singular tests](/docs/build/tests) are located. + ## Default Without specifying this config, dbt will search for tests in the `tests` directory, i.e. `test-paths: ["tests"]`. Specifically, it will look for `.sql` files containing: diff --git a/website/docs/reference/project-configs/version.md b/website/docs/reference/project-configs/version.md index 346f1f85fef..1c947412fcd 100644 --- a/website/docs/reference/project-configs/version.md +++ b/website/docs/reference/project-configs/version.md @@ -1,13 +1,22 @@ --- datatype: version required: True +keyword: project version, project versioning, dbt project versioning --- +import VersionsCallout from '/snippets/_version-callout.md'; + + + + + dbt projects have two distinct types of `version` tags.
This field has a different meaning depending on its location. ## `dbt_project.yml` versions -The version tag in a `dbt_project` file represents the version of your dbt project. Although **this is a required parameter**, it is not currently meaningfully used by dbt. The version must be in a [semantic version](https://semver.org/) format, e.g. `1.0.0`. For more on Core versions, see [About dbt Core versions](/docs/dbt-versions/core). +The version tag in a `dbt_project` file represents the version of your dbt project. Starting in version 1.5, `version` in the `dbt_project.yml` is an *optional parameter*. If specified, the version must be in a [semantic version](https://semver.org/) format, e.g. `1.0.0`. The default value if not specified is `None`. + +For more on Core versions, see [About dbt Core versions](/docs/dbt-versions/core). ```yml @@ -16,18 +25,86 @@ version: version +## `.yml` property file versions + +A version tag in a `.yml` property file provides the control tag, which informs how dbt processes property files. + +Starting from version 1.5, dbt will no longer require this configuration in your resource `.yml` files. If you want to know more about why this tag was previously required, you can refer to the [property file FAQs](reference/configs-and-properties#faqs). -The version must be in a [semantic version](https://semver.org/) format, e.g. `1.0.0` +For more on property files, see their general [documentation](reference/configs-and-properties#where-can-i-define-properties) on the same page. + + + + + + +```yml +version: 2 # Only 2 is accepted by dbt versions up to 1.4.latest. -## `.yml` property file versions +models: + ... +``` + + + + + + + + + +```yml + +models: + ... +``` + + + + + + + + + + + + +dbt projects have two distinct types of `version` tags. This field has a different meaning depending on its location. 
+ +## `dbt_project.yml` versions + +The version tag in a `dbt_project` file represents the version of your dbt project and **is a required parameter**. However, it isn't currently used in a meaningful way by dbt. The version must follow a [semantic version](https://semver.org/) format, such as 1.0.0. For more information about dbt Core versions, refer to [About dbt Core versions](/docs/dbt-versions/core). + + +```yml +version: version +``` + + + +## `.yml` property file versions A version tag in a `.yml` property file provides the control tag, which informs how dbt processes property files. For more on why we require this tag, see property file [FAQs](reference/configs-and-properties#faqs). For more on property files, see their general [documentation](reference/configs-and-properties#where-can-i-define-properties) on the same page. + ```yml version: 2 # Only 2 is accepted by current and recent versions of dbt. + +models: + ... ``` + + diff --git a/website/docs/reference/references-overview.md b/website/docs/reference/references-overview.md new file mode 100644 index 00000000000..85a374c5aa3 --- /dev/null +++ b/website/docs/reference/references-overview.md @@ -0,0 +1,59 @@ +--- +title: "About References" +id: "references-overview" +sidebar_label: "About References" +description: "Connect dbt to any data platform in dbt Cloud or dbt Core, using a dedicated adapter plugin" +hide_table_of_contents: true +pagination_next: null +pagination_prev: null +--- + +The References section contains reference materials for developing with dbt, which includes dbt Cloud and dbt Core. + +Learn how to add more configurations to your dbt project or adapter, use properties for extra ability, refer to dbt commands, use powerful Jinja functions to streamline your dbt project, and understand how to use dbt artifacts.
          + +
          + + + + + + + + + + + + + + + +
          diff --git a/website/docs/reference/resource-configs/access.md b/website/docs/reference/resource-configs/access.md new file mode 100644 index 00000000000..da50e48d2f0 --- /dev/null +++ b/website/docs/reference/resource-configs/access.md @@ -0,0 +1,97 @@ +--- +resource_types: [models] +datatype: access +--- + + + +```yml +version: 2 + +models: + - name: model_name + access: private | protected | public +``` + + + + + +Access modifiers may be applied to models one-by-one in YAML properties. In v1.5 and v1.6, you are unable to configure `access` for multiple models at once. Upgrade to v1.7 for additional configuration options. A group or subfolder contains models with varying access levels, so when you designate a model with `access: public`, make sure you intend for this behavior. + + + + + +You can apply access modifiers in config files, including the `dbt_project.yml`, or to models one-by-one in YAML properties. Applying access configs to a subfolder modifies the default for all models in that subfolder, so make sure you intend for this behavior. When setting individual model access, a group or subfolder might contain a variety of access levels, so when you designate a model with `access: public` make sure you intend for this behavior. + +There are multiple approaches to configuring access: + +In the model configs of `dbt_project.yml`: + +```yaml +models: + - name: my_public_model + access: public # Older method, still supported + +``` +Or (but not both) + +```yaml +models: + - name: my_public_model + config: + access: public # newly supported in v1.7 + +``` + +In a subfolder: +```yaml +models: + my_project_name: + subfolder_name: + +group: + +access: private # sets default for all models in this subfolder +``` + +In the model.sql file: + +```sql +-- models/my_public_model.sql + +{{ config(access = "public") }} + +select ... +``` + + + +## Definition +The access level of the model you are declaring properties for.
+ +Some models (not all) are designed to be referenced through the [ref](/reference/dbt-jinja-functions/ref) function across [groups](/docs/build/groups). + +| Access | Referenceable by | +|-----------|-------------------------------| +| private | same group | +| protected | same project/package | +| public | any group, package or project | + +If you try to reference a model outside of its supported access, you will see an error: + +```shell +dbt run -s marketing_model +... +dbt.exceptions.DbtReferenceError: Parsing Error + Node model.jaffle_shop.marketing_model attempted to reference node model.jaffle_shop.finance_model, + which is not allowed because the referenced node is private to the finance group. +``` + +## Default + +By default, all models are "protected." This means that other models in the same project can reference them. + +## Related docs + +* [Model Access](/docs/collaborate/govern/model-access#groups) +* [Group configuration](/reference/resource-configs/group) diff --git a/website/docs/reference/resource-configs/alias.md b/website/docs/reference/resource-configs/alias.md index cdf832f643d..40da45ebcd1 100644 --- a/website/docs/reference/resource-configs/alias.md +++ b/website/docs/reference/resource-configs/alias.md @@ -1,5 +1,6 @@ --- resource_types: [models, seeds, snapshots, tests] +description: "Read this guide to understand the alias configuration in dbt." datatype: string --- diff --git a/website/docs/reference/resource-configs/bigquery-configs.md b/website/docs/reference/resource-configs/bigquery-configs.md index 357082f2ebb..fe4d7d65a22 100644 --- a/website/docs/reference/resource-configs/bigquery-configs.md +++ b/website/docs/reference/resource-configs/bigquery-configs.md @@ -1,5 +1,6 @@ --- title: "BigQuery configurations" +description: "Read this guide to understand BigQuery configurations in dbt." id: "bigquery-configs" --- @@ -20,26 +21,6 @@ This will allow you to read and write from multiple BigQuery projects. 
Same for ### Partition clause - - -Before dbt v0.16.0, the `partition_by` configuration was supplied as string. While -the string specification syntax is still supported in dbt v0.16.0, it has been -deprecated and will be removed in a future release. **Note:** partitioning configs -using a range bucket *must* be supplied using the dictionary-style configuration as of -dbt v0.16.0. - -Example usage for versions of dbt < 0.16.0: - -```sql --- Partitioning by a timestamp field -{{ config( materialized='table', partition_by="date(created_at)" ) }} - --- Partitioning by a date field -{{ config( materialized='table', partition_by="created_date" ) }} -``` - - - BigQuery supports the use of a [partition by](https://cloud.google.com/bigquery/docs/data-definition-language#specifying_table_partitioning_options) clause to easily partition a table by a column or expression. This option can help decrease latency and cost when querying large tables. Note that partition pruning [only works](https://cloud.google.com/bigquery/docs/querying-partitioned-tables#pruning_limiting_partitions) when partitions are filtered using literal values (so selecting partitions using a subquery won't improve performance). The `partition_by` config can be supplied as a dictionary with the following format: @@ -60,7 +41,6 @@ The `partition_by` config can be supplied as a dictionary with the following for ``` #### Partitioning by a date or timestamp -Partitioning by hour, month or year is new in v0.19.0 When using a `datetime` or `timestamp` column to partition data, you can create partitions with a granularity of hour, day, month, or year. A `date` column supports granularity of day, month and year. Daily partitioning is the default for all column types.
@@ -104,7 +84,7 @@ from {{ ref('events') }} ```sql -create table analytics.bigquery_table +create table `projectname`.`analytics`.`bigquery_table` partition by timestamp_trunc(created_at, day) as ( @@ -113,7 +93,7 @@ as ( event_name, created_at - from analytics.events + from `analytics`.`events` ) ``` @@ -123,6 +103,77 @@ as (
          + + +#### Partitioning by an "ingestion" date or timestamp + +BigQuery supports an [older mechanism of partitioning](https://cloud.google.com/bigquery/docs/partitioned-tables#ingestion_time) based on the time when each row was ingested. While we recommend using the newer and more ergonomic approach to partitioning whenever possible, for very large datasets, there can be some performance improvements to using this older, more mechanistic approach. [Read more about the `insert_overwrite` incremental strategy below](#copying-ingestion-time-partitions). + +dbt will always instruct BigQuery to partition your table by the values of the column specified in `partition_by.field`. By configuring your model with `partition_by.time_ingestion_partitioning` set to `True`, dbt will use that column as the input to a `_PARTITIONTIME` pseudocolumn. Unlike with newer column-based partitioning, you must ensure that the values of your partitioning column match exactly the time-based granularity of your partitions. 
+ + + + + + +```sql +{{ config( + materialized="incremental", + partition_by={ + "field": "created_date", + "data_type": "timestamp", + "granularity": "day", + "time_ingestion_partitioning": true + } +) }} + +select + user_id, + event_name, + created_at, + -- values of this column must match the data type + granularity defined above + timestamp_trunc(created_at, day) as created_date + +from {{ ref('events') }} +``` + + + + + + + + +```sql +create table `projectname`.`analytics`.`bigquery_table` (`user_id` INT64, `event_name` STRING, `created_at` TIMESTAMP) +partition by timestamp_trunc(_PARTITIONTIME, day); + +insert into `projectname`.`analytics`.`bigquery_table` (_partitiontime, `user_id`, `event_name`, `created_at`) +select created_date as _partitiontime, * EXCEPT(created_date) from ( + select + user_id, + event_name, + created_at, + -- values of this column must match granularity defined above + timestamp_trunc(created_at, day) as created_date + + from `projectname`.`analytics`.`events` +); +``` + + + + + + + + #### Partitioning with integer buckets If the `data_type` is specified as `int64`, then a `range` key must also @@ -194,12 +245,6 @@ as ( #### Additional partition configs - - - - **v0.20.0:** Introduced `require_partition_filter` and `partition_expiration_days` - - - If your model has `partition_by` configured, you may optionally specify two additional configurations: - `require_partition_filter` (boolean): If set to `true`, anyone querying this model _must_ specify a partition filter, otherwise their query will fail. This is recommended for very large tables with obvious partitioning schemes, such as event streams grouped by day. Note that this will affect other dbt models or tests that try to select from this model, too. @@ -295,9 +340,7 @@ dbt supports the specification of BigQuery labels for the tables and BigQuery key-value pair entries for labels larger than 63 characters are truncated. 
**Configuring labels in a model file** @@ -371,12 +414,12 @@ models: columns: - name: field policy_tags: - - 'need_to_know' + - 'projects//locations//taxonomies//policyTags/' ``` -Please note that in order for policy tags to take effect, [column-level `persist_docs`](https://docs.getdbt.com/reference/resource-configs/persist_docs) must be enabled for the model, seed, or snapshot. +Please note that in order for policy tags to take effect, [column-level `persist_docs`](/reference/resource-configs/persist_docs) must be enabled for the model, seed, or snapshot. Consider using [variables](/docs/build/project-variables) to manage taxonomies and make sure to add the required security [roles](https://cloud.google.com/bigquery/docs/column-level-security-intro#roles) to your BigQuery service account key. ## Merge behavior (incremental models) @@ -408,22 +451,13 @@ when matched then update ... when not matched then insert ... ``` -The `merge` approach has the benefit of automatically updating any late-arriving facts in the -destination incremental table. The drawback of this approach is that BigQuery must scan all -source tables referenced in the model SQL, as well as the entirety of the destination table. -This can be slow and costly if the incremental model is transforming very large amounts of data. +The 'merge' approach automatically updates new data in the destination incremental table but requires scanning all source tables referenced in the model SQL, as well as destination tables. This can be slow and expensive for large data volumes. [Partitioning and clustering](#using-table-partitioning-and-clustering) techniques mentioned earlier can help mitigate these issues. **Note:** The `unique_key` configuration is required when the `merge` incremental strategy is selected. 
### The `insert_overwrite` strategy - - - - **v0.16.0:** Introduced `insert_overwrite` incremental strategy - - - The `insert_overwrite` strategy generates a merge statement that replaces entire partitions in the destination table. **Note:** this configuration requires that the model is configured with a [Partition clause](#partition-clause). The `merge` statement that dbt generates @@ -501,7 +535,7 @@ with events as ( {% if is_incremental() %} -- recalculate yesterday + today - where date(event_timestamp) in ({{ partitions_to_replace | join(',') }}) + where timestamp_trunc(event_timestamp, day) in ({{ partitions_to_replace | join(',') }}) {% endif %} ), @@ -516,12 +550,6 @@ _today_ and _yesterday_ every day that it is run. It is the fastest and cheapest way to incrementally update a table using dbt. If we wanted this to run more dynamically— let’s say, always for the past 3 days—we could leverage dbt’s baked-in [datetime macros](https://github.com/dbt-labs/dbt-core/blob/dev/octavius-catto/core/dbt/include/global_project/macros/etc/datetime.sql) and write a few of our own. - - - - **v0.19.0:** With the advent of truncated timestamp partitions in BigQuery, `timestamp`-type partitions are now treated as timestamps instead of dates for the purposes of filtering. Update `partitions_to_replace` accordingly. - - - Think of this as "full control" mode. You must ensure that expressions or literal values in the `partitions` config have proper quoting when templated, and that they match the `partition_by.data_type` (`timestamp`, `datetime`, `date`, or `int64`). Otherwise, the filter in the incremental `merge` statement will raise an error. #### Dynamic partitions @@ -566,17 +594,67 @@ with events as ( ... rest of model ...
``` + + +#### Copying ingestion-time partitions + +If you have configured your incremental model to use "ingestion"-based partitioning (`partition_by.time_ingestion_partitioning: True`), you can opt to use a legacy mechanism for inserting and overwriting partitions. While this mechanism doesn't offer the same visibility and ease of debugging as the SQL `merge` statement, it can yield significant savings in time and cost for large datasets. Behind the scenes, dbt will add or replace each partition via the [copy table API](https://cloud.google.com/bigquery/docs/managing-tables#copy-table) and partition decorators. + +You can enable this by switching on `copy_partitions: True` in the `partition_by` configuration. This approach works only in combination with "dynamic" partition replacement. + + + +```sql +{{ config( + materialized="incremental", + incremental_strategy="insert_overwrite", + partition_by={ + "field": "created_date", + "data_type": "timestamp", + "granularity": "day", + "time_ingestion_partitioning": true, + "copy_partitions": true + } +) }} + +select + user_id, + event_name, + created_at, + -- values of this column must match the data type + granularity defined above + timestamp_trunc(created_at, day) as created_date + +from {{ ref('events') }} +``` + + + + + +``` +... +[0m16:03:13.017641 [debug] [Thread-3 (]: BigQuery adapter: Copying table(s) "/projects/projectname/datasets/analytics/tables/bigquery_table$20230112" to "/projects/projectname/datasets/analytics/tables/bigquery_table$20230112" with disposition: "WRITE_TRUNCATE" +... +``` + + + + + ## Controlling table expiration -New in v0.18.0 By default, dbt-created tables never expire. You can configure certain model(s) to expire after a set number of hours by setting `hours_to_expiration`. +:::info Note +The `hours_to_expiration` only applies to initial creation of the underlying table. It doesn't reset for incremental models when they do another run. 
+::: + ```yml models: - [](resource-path): + [](/reference/resource-configs/resource-path): +hours_to_expiration: 6 ``` @@ -599,20 +677,18 @@ select ... ## Authorized Views -New in v0.18.0 - If the `grant_access_to` config is specified for a model materialized as a view, dbt will grant the view model access to select from the list of datasets provided. See [BQ docs on authorized views](https://cloud.google.com/bigquery/docs/share-access-views) for more details. - + ```yml models: - [](resource-path): + [](/reference/resource-configs/resource-path): +grant_access_to: - project: project_1 dataset: dataset_1 @@ -641,3 +717,4 @@ Views with this configuration will be able to select from objects in `project_1. #### Limitations The `grant_access_to` config is not thread-safe when multiple views need to be authorized for the same dataset. The initial `dbt run` operation after a new `grant_access_to` config is added should therefore be executed in a single thread. Subsequent runs using the same configuration will not attempt to re-apply existing access grants, and can make use of multiple threads. + diff --git a/website/docs/reference/resource-configs/check_cols.md b/website/docs/reference/resource-configs/check_cols.md index 727574a0927..50a34b6a87e 100644 --- a/website/docs/reference/resource-configs/check_cols.md +++ b/website/docs/reference/resource-configs/check_cols.md @@ -1,5 +1,6 @@ --- resource_types: [snapshots] +description: "Read this guide to understand the check_cols configuration in dbt." datatype: "[column_name] | all" --- @@ -19,9 +20,9 @@ datatype: "[column_name] | all" ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +strategy: check - +unique_key: [column_name] | all + +check_cols: [column_name] | all ``` @@ -32,7 +33,7 @@ A list of columns within the results of your snapshot query to check for changes Alternatively, use all columns using the `all` value (however this may be less performant). 
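-This parameter is **required if using the `check` [strategy](strategy)**. +This parameter is **required if using the `check` [strategy](/reference/resource-configs/strategy)**. ## Default No default is provided. diff --git a/website/docs/reference/resource-configs/clickhouse-configs.md b/website/docs/reference/resource-configs/clickhouse-configs.md index 9057b6cc228..acf5f1706ba 100644 --- a/website/docs/reference/resource-configs/clickhouse-configs.md +++ b/website/docs/reference/resource-configs/clickhouse-configs.md @@ -1,5 +1,6 @@ --- title: "ClickHouse configurations" +description: "Read this guide to understand ClickHouse configurations in dbt." id: "clickhouse-configs" --- @@ -10,6 +11,7 @@ id: "clickhouse-configs" | view materialization | YES | Creates a [view](https://clickhouse.com/docs/en/sql-reference/table-functions/view/). | | table materialization | YES | Creates a [table](https://clickhouse.com/docs/en/operations/system-tables/tables/). See below for the list of supported engines. | | incremental materialization | YES | Creates a table if it doesn't exist, and then writes only updates to it. | +| ephemeral materialization | YES | Creates an ephemeral/CTE materialization. This model is internal to dbt and does not create any database objects. | ### View Materialization @@ -154,14 +156,16 @@ models: #### Incremental Table Configuration -| Option | Description | Required? | -|----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| -| `materialized` | How the model will be materialized into ClickHouse. Must be `table` to create a table model. | Required | -| `unique_key` | A tuple of column names that uniquely identify rows. For more details on uniqueness constraints, see [here](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models#defining-a-uniqueness-constraint-optional). | Required. If not provided altered rows will be added twice to the incremental table. | -| `engine` | The table engine to use when creating tables. See list of supported engines below. | Optional (default: `MergeTree()`) | -| `order_by` | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | Optional (default: `tuple()`) | -| `partition_by` | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | Optional | -| `inserts_only` | If set to True, incremental updates will be inserted directly to the target incremental table without creating intermediate table. Read more about this configuration in our [doc](https://clickhouse.com/docs/en/integrations/dbt/dbt-incremental-model#inserts-only-mode) | Optional (default: `False`) | +| Option | Description | Required? | +|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| +| `materialized` | How the model will be materialized into ClickHouse. Must be `table` to create a table model. | Required | +| `unique_key` | A tuple of column names that uniquely identify rows.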
For more details on uniqueness constraints, see [here](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models#defining-a-uniqueness-constraint-optional). | Required. If not provided altered rows will be added twice to the incremental table. | -| `engine` | The table engine to use when creating tables. See list of supported engines below. | Optional (default: `MergeTree()`) | -| `order_by` | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | Optional (default: `tuple()`) | -| `partition_by` | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | Optional | -| `inserts_only` | If set to True, incremental updates will be inserted directly to the target incremental table without creating intermediate table. Read more about this configuration in our [doc](https://clickhouse.com/docs/en/integrations/dbt/dbt-incremental-model#inserts-only-mode) | Optional (default: `False`) | +| Option | Description | Required? | +|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| +| `materialized` | How the model will be materialized into ClickHouse. Must be `table` to create a table model. | Required | +| `unique_key` | A tuple of column names that uniquely identify rows. For more details on uniqueness constraints, see [here](https://docs.getdbt.com/docs/build/incremental-models#defining-a-uniqueness-constraint-optional). | Required. If not provided altered rows will be added twice to the incremental table. | +| `engine` | The table engine to use when creating tables. 
See list of supported engines below. | Optional (default: `MergeTree()`) | +| `order_by` | A tuple of column names or arbitrary expressions. This allows you to create a small sparse index that helps find data faster. | Optional (default: `tuple()`) | +| `partition_by` | A partition is a logical combination of records in a table by a specified criterion. The partition key can be any expression from the table columns. | Optional | +| `inserts_only` | (Deprecated, see the `append` materialization strategy). If True, incremental updates will be inserted directly to the target incremental table without creating an intermediate table. | Optional (default: `False`) | +| `incremental_strategy` | The strategy to use for incremental materialization. `delete+insert` and `append` are supported. For additional details on strategies, see [here](https://github.com/ClickHouse/dbt-clickhouse#incremental-model-strategies) | Optional (default: 'default') | +| `incremental_predicates` | Incremental predicate clause to be applied to `delete+insert` materializations | Optional | ## Snapshot @@ -184,12 +188,12 @@ dbt snapshots allow a record to be made of changes to a mutable model over time. #### Snapshot Configuration -| Option | Description | Required? | -|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| -| `target_schema` | A ClickHouse's database name where the snapshot table will be created. | Required | -| `unique_key` | A tuple of column names that uniquely identify rows. | Required. If not provided altered rows will be added twice to the incremental table. | -| `strategy` | Defines how dbt knows if a row has changed. 
More about dbt startegies [here](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots#detecting-row-changes) | Required | -| `updated_at` | If using the timestamp strategy, the timestamp column to compare. | Only if using the timestamp strategy | +| Option | Description | Required? | +|-----------------|---------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------| +| `target_schema` | A ClickHouse database name where the snapshot table will be created. | Required | +| `unique_key` | A tuple of column names that uniquely identify rows. | Required. If not provided altered rows will be added twice to the incremental table. | +| `strategy` | Defines how dbt knows if a row has changed. More about dbt strategies [here](/docs/build/snapshots#detecting-row-changes) | Required | +| `updated_at` | If using the timestamp strategy, the timestamp column to compare. | Only if using the timestamp strategy | ## Supported Table Engines diff --git a/website/docs/reference/resource-configs/column_types.md b/website/docs/reference/resource-configs/column_types.md index 274166a9aba..8d236171ba2 100644 --- a/website/docs/reference/resource-configs/column_types.md +++ b/website/docs/reference/resource-configs/column_types.md @@ -26,7 +26,7 @@ seeds: -Or (as of v0.21): +Or: diff --git a/website/docs/reference/resource-configs/contract.md b/website/docs/reference/resource-configs/contract.md new file mode 100644 index 00000000000..59cc511890b --- /dev/null +++ b/website/docs/reference/resource-configs/contract.md @@ -0,0 +1,122 @@ +--- +resource_types: [models] +description: "When the contract configuration is enforced, dbt will ensure that your model's returned dataset exactly matches the attributes you have defined in yaml, such as name and data_type, as well as any additional constraints supported by the data platform."
+datatype: "{}" +default_value: {contract: false} +id: "contract" +--- + +:::info New functionality +This functionality is new in v1.5. +::: + +## Related documentation +- [What is a model contract?](/docs/collaborate/govern/model-contracts) +- [Defining `columns`](/reference/resource-properties/columns) +- [Defining `constraints`](/reference/resource-properties/constraints) + +# Definition + +When the `contract` configuration is enforced, dbt will ensure that your model's returned dataset exactly matches the attributes you have defined in yaml: +- `name` and `data_type` for every column +- Additional [`constraints`](/reference/resource-properties/constraints), as supported for this materialization and data platform + +This is to ensure that the people querying your model downstream—both inside and outside dbt—have a predictable and consistent set of columns to use in their analyses. Even a subtle change in data type, such as from `boolean` (`true`/`false`) to `integer` (`0`/`1`), could cause queries to fail in surprising ways. + + + +The `data_type` defined in your YAML file must match a data type your data platform recognizes. dbt does not do any type aliasing itself. If your data platform recognizes both `int` and `integer` as corresponding to the same type, then they will return a match. + + + + + +dbt uses built-in type aliasing for the `data_type` defined in your YAML. For example, you can specify `string` in your contract, and on Postgres/Redshift, dbt will convert it to `text`. If dbt doesn't recognize the `data_type` name among its known aliases, it will pass it through as-is. This is enabled by default, but you can opt-out by setting `alias_types` to `false`. + +Example for disabling: + +```yml + +models: + - name: my_model + config: + contract: + enforced: true + alias_types: false # true by default + +``` + + + +When dbt compares data types, it will not compare granular details such as size, precision, or scale. 
We don't think you should sweat the difference between `varchar(256)` and `varchar(257)`, because it doesn't really affect the experience of downstream queriers. You can accomplish a more-precise assertion by [writing or using a custom test](/guides/best-practices/writing-custom-generic-tests). + +Note that you need to specify a varchar size or numeric scale, otherwise dbt relies on default values. For example, if a `numeric` type defaults to a precision of 38 and a scale of 0, then the numeric column stores 0 digits to the right of the decimal (it only stores whole numbers), which might cause it to fail contract enforcement. To avoid this implicit coercion, specify your `data_type` with a nonzero scale, like `numeric(38, 6)`. dbt Core 1.7 and higher provides a warning if you don't specify precision and scale when providing a numeric data type. + +## Example + + + +```yml +models: + - name: dim_customers + config: + materialized: table + contract: + enforced: true + columns: + - name: customer_id + data_type: int + constraints: + - type: not_null + - name: customer_name + data_type: string + - name: non_integer + data_type: numeric(38,3) +``` + + + +Let's say your model is defined as: + + + +```sql +select + 'abc123' as customer_id, + 'My Best Customer' as customer_name +``` + + + +When you `dbt run` your model, _before_ dbt has materialized it as a table in the database, you will see this error: +```txt +20:53:45 Compilation Error in model dim_customers (models/dim_customers.sql) +20:53:45 This model has an enforced contract that failed. +20:53:45 Please ensure the name, data_type, and number of columns in your contract match the columns in your model's definition. 
+20:53:45 +20:53:45 | column_name | definition_type | contract_type | mismatch_reason | +20:53:45 | ----------- | --------------- | ------------- | ------------------ | +20:53:45 | customer_id | TEXT | INT | data type mismatch | +20:53:45 +20:53:45 +20:53:45 > in macro assert_columns_equivalent (macros/materializations/models/table/columns_spec_ddl.sql) +``` + +## Support + +At present, model contracts are supported for: +- SQL models (not yet Python) +- Models materialized as `table`, `view`, and `incremental` (with `on_schema_change: append_new_columns`) +- The most popular data platforms — though support and enforcement of different [constraint types](/reference/resource-properties/constraints) vary by platform + +### Incremental models and `on_schema_change` + +Why require that incremental models also set [`on_schema_change`](/docs/build/incremental-models#what-if-the-columns-of-my-incremental-model-change), and why to `append_new_columns`? + +Imagine: +- You add a new column to both the SQL and the YAML spec +- You don't set `on_schema_change`, or you set `on_schema_change: 'ignore'` +- dbt doesn't actually add that new column to the existing table — and the upsert/merge still succeeds, because it does that upsert/merge on the basis of the already-existing "destination" columns only (this is long-established behavior) +- The result is a delta between the yaml-defined contract, and the actual table in the database - which means the contract is now incorrect! + +Why `append_new_columns`, rather than `sync_all_columns`? Because removing existing columns is a breaking change for contracted models! 
diff --git a/website/docs/reference/resource-configs/database.md b/website/docs/reference/resource-configs/database.md index 41115864850..9c63b0ca457 100644 --- a/website/docs/reference/resource-configs/database.md +++ b/website/docs/reference/resource-configs/database.md @@ -1,6 +1,8 @@ --- +sidebar_label: "database" resource_types: [models, seeds, tests] datatype: string +description: "Read this guide to understand the database configuration in dbt." --- :::caution Heads up! @@ -10,7 +12,7 @@ This is a work in progress document. While this configuration applies to multipl ## Definition -Optionally specify a custom database for a [model](docs/build/models) or [seed](/docs/build/seeds). (To specify a database for a [snapshot](snapshots), use the [`target_database` config](target_database)). +Optionally specify a custom database for a [model](/docs/build/sql-models) or [seed](/docs/build/seeds). (To specify a database for a [snapshot](/docs/build/snapshots), use the [`target_database` config](/reference/resource-configs/target_database)). When dbt creates a relation (table/view) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments` @@ -20,14 +22,8 @@ The standard behavior of dbt is: To learn more about changing the way that dbt generates a relation's `database`, read [Using Custom Databases](/docs/build/custom-databases) - - -* `v0.13.0`: Support for the `database` config is added -* `v0.16.0`: The `generate_database_name` macro was added to control how the `database` config is used by dbt - - - ## Usage + ### Load seeds into the RAW database @@ -41,4 +37,4 @@ seeds: ## Warehouse specific information * BigQuery: `project` and `database` are interchangeable -* Redshift: Cross-database queries are not possible in Redshift. As such, dbt will return a Database Error if you use this configuration.
+ diff --git a/website/docs/reference/resource-configs/databricks-configs.md b/website/docs/reference/resource-configs/databricks-configs.md new file mode 100644 index 00000000000..e57e1efc04a --- /dev/null +++ b/website/docs/reference/resource-configs/databricks-configs.md @@ -0,0 +1,439 @@ +--- +title: "Databricks configurations" +id: "databricks-configs" +--- + +## Configuring tables + +When materializing a model as `table`, you may include several optional configs that are specific to the dbt-databricks plugin, in addition to the standard [model configs](/reference/model-configs). + + + +| Option | Description | Required? | Example | +|---------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|----------------| +| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `delta` | +| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | `/mnt/root` | +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | `date_day` | +| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | `8` | + + + + + + +| Option | Description | Required? 
| Model Support | Example | +|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------|---------------|----------------| +| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | SQL, Python | `delta` | +| location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | SQL, Python | `/mnt/root` | +| partition_by | Partition the created table by the specified columns. A directory is created for each partition. | Optional | SQL, Python | `date_day` | +| liquid_clustered_by | Cluster the created table by the specified columns. Clustering method is based on [Delta's Liquid Clustering feature](https://docs.databricks.com/en/delta/clustering.html). Available since dbt-databricks 1.6.2. | Optional | SQL | `date_day` | +| clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | SQL, Python | `country_code` | +| buckets | The number of buckets to create while clustering | Required if `clustered_by` is specified | SQL, Python | `8` | + + + + +## Incremental models + +The dbt-databricks plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models#about-incremental_strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of four values: + - **`append`** (default): Insert new records without updating or overwriting any existing data. + - **`insert_overwrite`**: If `partition_by` is specified, overwrite partitions in the table with new data. If no `partition_by` is specified, overwrite the entire table with new data.
+ - **`merge`** (Delta and Hudi file format only): Match records based on a `unique_key`, updating old records, and inserting new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + - **`replace_where`** (Delta file format only): Match records based on `incremental_predicates`, replacing all records that match the predicates from the existing table with records matching the predicates from the new data. (If no `incremental_predicates` are specified, all new data is inserted, similar to `append`.) + +Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block. + +### The `append` strategy + +Following the `append` strategy, dbt will perform an `insert into` statement with all new data. The appeal of this strategy is that it is straightforward and functional across all platforms, file types, connection methods, and Apache Spark versions. However, this strategy _cannot_ update, overwrite, or delete existing data, so it is likely to insert duplicate records for many data sources. + +Specifying `append` as the incremental strategy is optional, since it's the default strategy used when none is specified. 
+ + + + + + +```sql +{{ config( + materialized='incremental', + incremental_strategy='append', +) }} + +-- All rows returned by this query will be appended to the existing table + +select * from {{ ref('events') }} +{% if is_incremental() %} + where event_ts > (select max(event_ts) from {{ this }}) +{% endif %} +``` + + + + + + +```sql +create temporary view databricks_incremental__dbt_tmp as + + select * from analytics.events + + where event_ts >= (select max(event_ts) from {{ this }}) + +; + +insert into table analytics.databricks_incremental + select `date_day`, `users` from databricks_incremental__dbt_tmp +``` + + + + + +### The `insert_overwrite` strategy + +This strategy is most effective when specified alongside a `partition_by` clause in your model config. dbt will run an [atomic `insert overwrite` statement](https://spark.apache.org/docs/3.0.0-preview/sql-ref-syntax-dml-insert-overwrite-table.html) that dynamically replaces all partitions included in your query. Be sure to re-select _all_ of the relevant data for a partition when using this incremental strategy. + +If no `partition_by` is specified, then the `insert_overwrite` strategy will atomically replace all contents of the table, overriding all existing data with only the new records. The column schema of the table remains the same, however. This can be desirable in some limited circumstances, since it minimizes downtime while the table contents are overwritten. The operation is comparable to running `truncate` + `insert` on other databases. For atomic replacement of Delta-formatted tables, use the `table` materialization (which runs `create or replace`) instead. 
+ + + + + + +```sql +{{ config( + materialized='incremental', + partition_by=['date_day'], + file_format='parquet' +) }} + +/* + Every partition returned by this query will be overwritten + when this model runs +*/ + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + date_day, + count(*) as users + +from new_events +group by 1 +``` + + + + + + + +```sql +create temporary view databricks_incremental__dbt_tmp as + + with new_events as ( + + select * from analytics.events + + + where date_day >= date_add(current_date, -1) + + + ) + + select + date_day, + count(*) as users + + from events + group by 1 + +; + +insert overwrite table analytics.databricks_incremental + partition (date_day) + select `date_day`, `users` from databricks_incremental__dbt_tmp +``` + + + + + +### The `merge` strategy + +The `merge` incremental strategy requires: +- `file_format: delta or hudi` +- Databricks Runtime 5.1 and above for delta file format +- Apache Spark for hudi file format + +dbt will run an [atomic `merge` statement](https://docs.databricks.com/spark/latest/spark-sql/language-manual/merge-into.html) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. If a `unique_key` is specified (recommended), dbt will update old records with values from new records that match on the key column. If a `unique_key` is not specified, dbt will forgo match criteria and simply insert all new records (similar to `append` strategy). 
+ + + + + + +```sql +{{ config( + materialized='incremental', + file_format='delta', # or 'hudi' + unique_key='user_id', + incremental_strategy='merge' +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` + + + + + + + +```sql +create temporary view merge_incremental__dbt_tmp as + + with new_events as ( + + select * from analytics.events + + + where date_day >= date_add(current_date, -1) + + + ) + + select + user_id, + max(date_day) as last_seen + + from events + group by 1 + +; + +merge into analytics.merge_incremental as DBT_INTERNAL_DEST + using merge_incremental__dbt_tmp as DBT_INTERNAL_SOURCE + on DBT_INTERNAL_SOURCE.user_id = DBT_INTERNAL_DEST.user_id + when matched then update set * + when not matched then insert * +``` + + + + + + +### The `replace_where` strategy + +The `replace_where` incremental strategy requires: +- `file_format: delta` +- Databricks Runtime 12.0 and above + +dbt will run an [atomic `replace where` statement](https://docs.databricks.com/en/delta/selective-overwrite.html#arbitrary-selective-overwrite-with-replacewhere) which selectively overwrites data matching one or more `incremental_predicates` specified as a string or array. Only rows matching the predicates will be inserted. If no `incremental_predicates` are specified, dbt will perform an atomic insert, as with `append`. + +:::caution + +`replace_where` inserts data into columns in the order provided, rather than by column name. If you reorder columns and the data is compatible with the existing schema, you may silently insert values into an unexpected column. If the incoming data is incompatible with the existing schema, you will instead receive an error. 
+ +::: + + + + + + +```sql +{{ config( + materialized='incremental', + file_format='delta', + incremental_strategy = 'replace_where' + incremental_predicates = 'user_id >= 10000' # Never replace users with ids < 10000 +) }} + +with new_events as ( + + select * from {{ ref('events') }} + + {% if is_incremental() %} + where date_day >= date_add(current_date, -1) + {% endif %} + +) + +select + user_id, + max(date_day) as last_seen + +from events +group by 1 +``` + + + + + + + +```sql +create temporary view replace_where__dbt_tmp as + + with new_events as ( + + select * from analytics.events + + + where date_day >= date_add(current_date, -1) + + + ) + + select + user_id, + max(date_day) as last_seen + + from events + group by 1 + +; + +insert into analytics.replace_where_incremental + replace where user_id >= 10000 + table `replace_where__dbt_tmp` +``` + + + + + + + +## Persisting model descriptions + +Relation-level docs persistence is supported in dbt v0.17.0. For more +information on configuring docs persistence, see [the docs](/reference/resource-configs/persist_docs). + +When the `persist_docs` option is configured appropriately, you'll be able to +see model descriptions in the `Comment` field of `describe [table] extended` +or `show table extended in [database] like '*'`. + + +## Default file format configurations + +To access advanced incremental strategies features, such as +[snapshots](/reference/commands/snapshot) and the `merge` incremental strategy, you will want to +use the Delta or Hudi file format as the default file format when materializing models as tables. 
+ +It's quite convenient to do this by setting a top-level configuration in your +project file: + + + +```yml +models: + +file_format: delta # or hudi + +seeds: + +file_format: delta # or hudi + +snapshots: + +file_format: delta # or hudi +``` + + + + + +## Materialized views and streaming tables + +Starting with version 1.6.0, the dbt-databricks adapter supports [materialized views](https://docs.databricks.com/en/sql/user/materialized-views.html) and [streaming tables](https://docs.databricks.com/en/sql/load-data-streaming-table.html), as alternatives to incremental tables that are powered by [Delta Live Tables](https://docs.databricks.com/en/delta-live-tables/index.html). +See [What are Delta Live Tables?](https://docs.databricks.com/en/delta-live-tables/index.html#what-are-delta-live-tables-datasets) for more information and use cases. +These features are still in preview, and the support in the dbt-databricks adapter should, for now, be considered _experimental_. +In order to adopt these materialization strategies, you will need a workspace that is enabled for Unity Catalog and serverless SQL Warehouses. + + + +```sql +{{ config( + materialized = 'materialized_view' + ) }} +``` + + + +or + + + +```sql +{{ config( + materialized = 'streaming_table' + ) }} +``` + + + +When dbt detects a pre-existing relation of one of these types, it issues a `REFRESH` [command](https://docs.databricks.com/en/sql/language-manual/sql-ref-syntax-ddl-refresh-full.html). + +### Limitations + +As mentioned above, support for these materializations in the Databricks adapter is still limited. +At this time the following configuration options are not available: + +* Specifying a refresh schedule for these materializations +* Specifying `on_configuration_change` settings. 
+ +Additionally, if you change the model definition of your materialized view or streaming table, you will need to drop the materialization in your warehouse directly before running dbt again; otherwise, you will get a refresh error. + +We plan to address these limitations during the 1.7.x timeframe. + diff --git a/website/docs/reference/resource-configs/delimiter.md b/website/docs/reference/resource-configs/delimiter.md new file mode 100644 index 00000000000..58d6ba8344a --- /dev/null +++ b/website/docs/reference/resource-configs/delimiter.md @@ -0,0 +1,126 @@ +--- +resource_types: [seeds] +datatype: +default_value: "," +--- + +## Definition + +You can use this optional seed configuration to customize how you separate values in a [seed](/docs/build/seeds) by providing a one-character string. + +* The delimiter defaults to a comma when not specified. +* Explicitly set the `delimiter` configuration value if you want seed files to use a different delimiter, such as "|" or ";". + +:::info New in 1.7! + +Delimiter is new functionality available beginning with dbt Core v1.7.
+ +::: + + +## Usage + +Specify a delimiter in your `dbt_project.yml` file to customize the global separator for all seed values: + + + +```yml +seeds: + : + +delimiter: "|" # default project delimiter for seeds will be "|" + : + +delimiter: "," # delimiter for seeds in seed_subdirectory will be "," +``` + + + + +Or use a custom delimiter to override the values for a specific seed: + + + +```yml +version: 2 + +seeds: + - name: + config: + delimiter: "|" +``` + + + +## Examples +For a project with: + +* `name: jaffle_shop` in the `dbt_project.yml` file +* `seed-paths: ["seeds"]` in the `dbt_project.yml` file + +### Use a custom delimiter to override global values + +You can set a default behavior for all seeds with an exception for one seed, `seed_a`, which uses a comma: + + + +```yml +seeds: + jaffle_shop: + +delimiter: "|" # default delimiter for seeds in jaffle_shop project will be "|" + seed_a: + +delimiter: "," # delimiter for seed_a will be "," +``` + + + +Your corresponding seed files would be formatted like this: + + + +```text +col_a|col_b|col_c +1|2|3 +4|5|6 +... +``` + + + + + +```text +name,id +luna,1 +doug,2 +... +``` + + + +Or you can configure custom behavior for one seed. The `country_codes` uses the ";" delimiter: + + + +```yml +version: 2 + +seeds: + - name: country_codes + config: + delimiter: ";" +``` + + + +The `country_codes` seed file would be formatted like this: + + + +```text +country_code;country_name +US;United States +CA;Canada +GB;United Kingdom +... +``` + + diff --git a/website/docs/reference/resource-configs/docs.md b/website/docs/reference/resource-configs/docs.md index c35890e0bf7..0ccd21d7504 100644 --- a/website/docs/reference/resource-configs/docs.md +++ b/website/docs/reference/resource-configs/docs.md @@ -1,5 +1,7 @@ --- +sidebar_label: "docs" resource_types: models +description: "Docs - Read this in-depth guide to learn about configurations in dbt." 
datatype: "{dictionary}" default_value: {show: true} --- @@ -26,6 +28,7 @@ models: - name: model_name docs: show: true | false + node_color: "black" ``` @@ -111,22 +114,16 @@ macros: ``` - +Also refer to [macro properties](/reference/macro-properties). ## Definition -The docs field can be used to provide documentation-specific configuration to models. The only currently supported docs attribute is shown, which controls whether or not models are shown in the auto-generated documentation website. +The docs field can be used to provide documentation-specific configuration to models. It supports the doc attribute `show`, which controls whether or not models are shown in the auto-generated documentation website. It also supports `node_color` for some node types. **Note:** hidden models will still appear in the dbt DAG visualization but will be identified as "hidden." - - -* `v0.16.0`: This property was added - - - ## Default The default value for `show` is `true`. @@ -171,7 +168,7 @@ models: ## Custom node colors -The `docs` attribute now supports `node_color` to customize the node color in the DAG within dbt docs. You can define node colors in the files below and apply overrides where needed. +The `docs` attribute now supports `node_color` to customize the display color of some node types in the DAG within dbt docs. You can define node colors in the files below and apply overrides where needed. `node_color` hierarchy: @@ -180,7 +177,7 @@ The `docs` attribute now supports `node_color` to customize the node color in th ## Examples -Add custom node colors to models within subdirectories based on hex codes or a plain color name. +Add custom `node_color` values to models that support them within subdirectories based on hex codes or a plain color name.
![Example](../../../../website/static/img/node_color_example.png) diff --git a/website/docs/reference/resource-configs/doris-configs.md b/website/docs/reference/resource-configs/doris-configs.md new file mode 100644 index 00000000000..851639992cd --- /dev/null +++ b/website/docs/reference/resource-configs/doris-configs.md @@ -0,0 +1,186 @@ +--- +title: "Doris/SelectDB configurations" +description: "Doris/SelectDB Configurations - Read this in-depth guide to learn about configurations in dbt." +id: "doris-configs" +--- + +## Models + +| Type | Supported? | Details | +|-----------------------------|------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| view materialization | YES | Creates a [view](https://doris.apache.org/docs/sql-manual/sql-reference/Data-Definition-Statements/Create/CREATE-VIEW/). | +| table materialization | YES | Creates a [table](https://doris.apache.org/docs/sql-manual/sql-reference/Data-Definition-Statements/Create/CREATE-TABLE/). | +| incremental materialization | YES | Creates a table if it doesn't exist, and then item table model must be '[unique](https://doris.apache.org/docs/data-table/data-model#uniq-model/)'. | + +### View Materialization + +A dbt model can be created as a Doris view and configured using the following syntax: + + + + + + +```yaml +models: + : + +materialized: view +``` + + + + + + + +```jinja +{{ config(materialized = "view") }} +``` + + + + + +### Table Materialization + +A dbt model can be created as a [Doris table](https://doris.apache.org/docs/sql-manual/sql-reference/Data-Definition-Statements/Create/CREATE-TABLE/) and configured using the following syntax: + + + + + + +```yaml +models: + : + +materialized: table + +duplicate_key: [ , ... ], + +partition_by: [ , ... ], + +partition_type: , + +partition_by_init: [, ... ] + +distributed_by: [ , ... 
], + +buckets: int, + +properties: {:,...} +``` + + + + + + + +```jinja +{{ config( + materialized = "table", + duplicate_key = [ "", ... ], + partition_by = [ "", ... ], + partition_type = "", + partition_by_init = ["", ... ] + distributed_by = [ "", ... ], + buckets = "int", + properties = {"":"",...} + ... + ] +) }} +``` + + + + + +#### Table Configuration + +| Option | Description | Required? | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------| +| `materialized` | How the model will be materialized into Doris. Must be `table` to create a table model. | Required | +| `duplicate_key` | The key list of Doris table model :'[duplicate](https://doris.apache.org/docs/data-table/data-model#duplicate-model)'. | Required | +| `partition_by` | The partition key list of Doris. ([Doris partition](https://doris.apache.org/docs/data-table/data-partition)) | Optional | +| `partition_type` | The partition type of Doris. | Optional (default: `RANGE`) | +| `partition_by_init` | The partition rule or some real partitions item. | Optional | +| `distributed_by` | The bucket key list of Doris. ([Doris distribute](https://doris.apache.org/docs/data-table/data-partition#partitioning-and-bucket)) | Required | +| `buckets` | The bucket number in one Doris partition. | Required | +| `properties` | The other configuration of Doris. ([Doris properties](https://doris.apache.org/docs/sql-manual/sql-reference/Data-Definition-Statements/Create/CREATE-TABLE/?&_highlight=properties)) | Required | + +### Incremental Materialization + +An incremental Doris table, item table model must be 'unique' and is configured using the following syntax: + + + + + + +```yaml +models: + : + +materialized: incremental + +unique_key: [ , ... ], + +partition_by: [ , ... ], + +partition_type: , + +partition_by_init: [, ... 
] + +distributed_by: [ , ... ], + +buckets: int, + +properties: {:,...} +``` + + + + + + + +```jinja +{{ config( + materialized = "incremental", + unique_key = [ "", ... ], + partition_by = [ "", ... ], + partition_type = "", + partition_by_init = ["", ... ] + distributed_by = [ "", ... ], + buckets = "int", + properties = {"":"",...} + ... + ] +) }} +``` + + + + + +#### Incremental Table Configuration + +| Option | Description | Required? | +|---------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------| +| `materialized` | How the model will be materialized into Doris. Must be `incremental` to create an incremental model. | Required | +| `unique_key` | The key list of Doris table model :'[Doris unique](https://doris.apache.org/docs/data-table/data-model#uniq-model)'. | Required | +| `partition_by` | The partition key list of Doris. ([Doris partition](https://doris.apache.org/docs/data-table/data-partition)) | Optional | +| `partition_type` | The partition type of Doris. | Optional (default: `RANGE`) | +| `partition_by_init` | The partition rule or some real partitions item. | Optional | +| `distributed_by` | The bucket key list of Doris. ([Doris distribute](https://doris.apache.org/docs/data-table/data-partition#partitioning-and-bucket)) | Required | +| `buckets` | The bucket number in one Doris partition. | Required | +| `properties` | The other configuration of Doris. 
([Doris properties](https://doris.apache.org/docs/sql-manual/sql-reference/Data-Definition-Statements/Create/CREATE-TABLE/?&_highlight=properties)) | Required | + + + diff --git a/website/docs/reference/resource-configs/enabled.md b/website/docs/reference/resource-configs/enabled.md index 5e9667bd04c..52045503088 100644 --- a/website/docs/reference/resource-configs/enabled.md +++ b/website/docs/reference/resource-configs/enabled.md @@ -1,5 +1,6 @@ --- resource_types: all +description: "Enabled - Read this in-depth guide to learn about configurations in dbt." datatype: boolean default_value: true --- @@ -14,6 +15,7 @@ default_value: true { label: 'Sources', value: 'sources', }, { label: 'Metrics', value: 'metrics', }, { label: 'Exposures', value: 'exposures', }, + { label: 'Semantic models', value: 'semantic models', }, ] }> @@ -37,7 +39,7 @@ select ... ```yml models: - [](resource-path): + [](/reference/resource-configs/resource-path): +enabled: true | false ``` @@ -53,7 +55,7 @@ models: ```yml seeds: - [](resource-path): + [](/reference/resource-configs/resource-path): +enabled: true | false ``` @@ -85,7 +87,7 @@ select ... ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +enabled: true | false ``` @@ -127,7 +129,7 @@ select ... 
```yml tests: - [](resource-path): + [](/reference/resource-configs/resource-path): +enabled: true | false ``` @@ -142,14 +144,13 @@ tests: ```yaml sources: - [](resource-path): - [+](plus-prefix)enabled: true | false + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)enabled: true | false ``` - @@ -158,18 +159,17 @@ version: 2 sources: - name: [] - [config](resource-properties/config): + [config](/reference/resource-properties/config): enabled: true | false tables: - name: [] - [config](resource-properties/config): + [config](/reference/resource-properties/config): enabled: true | false ``` - @@ -187,8 +187,8 @@ Support for disabling metrics was added in dbt Core v1.3 ```yaml metrics: - [](resource-path): - [+](plus-prefix)enabled: true | false + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)enabled: true | false ``` @@ -201,7 +201,7 @@ version: 2 metrics: - name: [] - [config](resource-properties/config): + [config](/reference/resource-properties/config): enabled: true | false ``` @@ -226,8 +226,8 @@ Support for disabling exposures was added in dbt Core v1.3 ```yaml exposures: - [](resource-path): - [+](plus-prefix)enabled: true | false + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)enabled: true | false ``` @@ -240,7 +240,7 @@ version: 2 exposures: - name: [] - [config](resource-properties/config): + [config](/reference/resource-properties/config): enabled: true | false ``` @@ -251,16 +251,45 @@ exposures: + + + + +Support for disabling semantic models has been added in dbt Core v1.7 + + + + + + + +```yml +semantic_models: + - name: semantic_people + model: ref('people') + config: + enabled: false + +``` + + + +The `enabled` configuration can be nested under the `config` key. + + + + + ## Definition -An optional configuration for disabling models, seeds, snapshots, and tests. 
+An optional configuration for disabling models, seeds, snapshots, tests, and semantic models. * Default: true When a resource is disabled, dbt will not consider it as part of your project. Note that this can cause compilation errors. -If you instead want to exclude a model from a particular run, consider using the `--exclude` parameter as part of the [model selection syntax](node-selection/syntax) +If you instead want to exclude a model from a particular run, consider using the `--exclude` parameter as part of the [model selection syntax](/reference/node-selection/syntax) If you are disabling models because they are no longer being used, but you want to version control their SQL, consider making them an [analysis](/docs/build/analyses) instead. diff --git a/website/docs/reference/resource-configs/fabric-configs.md b/website/docs/reference/resource-configs/fabric-configs.md new file mode 100644 index 00000000000..ed0f91b9d84 --- /dev/null +++ b/website/docs/reference/resource-configs/fabric-configs.md @@ -0,0 +1,98 @@ +--- +title: "Microsoft Fabric DWH configurations" +id: "fabric-configs" +--- + +## Materializations + +Ephemeral materialization is not supported due to T-SQL not supporting nested CTEs. It may work in some cases when you're working with very simple ephemeral models. + +### Tables + +Tables are the default materialization. + + + + + + + +```sql +{{ + config( + materialized='table' + ) +}} + +select * +from ... +``` + + + + + + + + + +```yaml +models: + your_project_name: + materialized: view + staging: + materialized: table +``` + + + + + + + +## Seeds + +By default, `dbt-fabric` will attempt to insert seed files in batches of 400 rows. +If this exceeds the Microsoft Fabric Synapse Data Warehouse limit of 2100 parameters, the adapter will automatically reduce the batch size to the highest safe value possible. + +To set a different default seed value, you can set the variable `max_batch_size` in your project configuration. 
+ + + +```yaml +vars: + max_batch_size: 200 # Any integer less than or equal to 2100 will do. +``` + + + +## Snapshots + +Columns in source tables cannot have any constraints. +If, for example, any column has a `NOT NULL` constraint, an error will be thrown. + +## Indexes + +Indexes are not supported by Microsoft Fabric Synapse Data Warehouse. Any indexes provided as a configuration are ignored by the adapter. + +## Grants with auto provisioning + +Grants with auto provisioning are not supported by Microsoft Fabric Synapse Data Warehouse at this time. + +## Permissions + +The AAD identity (user or service principal) must be a Fabric Workspace admin to work on the database level at this time. Fine-grained access control will be incorporated in the future. + +## cross-database macros + +Not supported at this time. + +## dbt-utils + +Not supported at this time. diff --git a/website/docs/reference/resource-configs/fail_calc.md b/website/docs/reference/resource-configs/fail_calc.md index 8f3c0fe0c1a..6ee8e5f71ce 100644 --- a/website/docs/reference/resource-configs/fail_calc.md +++ b/website/docs/reference/resource-configs/fail_calc.md @@ -3,13 +3,6 @@ resource_types: [tests] datatype: string --- - - -* `v0.20.0`: Introduced `fail_calc` config -* `v0.21.0`: Introduced `config` property for tests - - - Test queries are written to return a set of failing records, ones not matching the expectation or assertion declared by that test: duplicate records, null values, etc. Most often, this is the count of rows returned by the test query: the default value of `fail_calc` is `count(*)`. But it can also be a custom calculation, whether an aggregate calculation or simply the name of a column to be selected from the test query. 
diff --git a/website/docs/reference/resource-configs/fal-configs.md b/website/docs/reference/resource-configs/fal-configs.md new file mode 100644 index 00000000000..101e74f5f5a --- /dev/null +++ b/website/docs/reference/resource-configs/fal-configs.md @@ -0,0 +1,78 @@ +--- +title: "fal configurations" +id: "fal-configs" +--- + +## Setting the `db_profile` + +The fal profile configuration needs the `db_profile` property set to the profile configuring your database for SQL models. + +fal will wrap around adapter and just handle Python models while letting all the SQL +needs to the underlying database adapter. + +fal will inherit the `threads` configuration from the `db_profile` unless explicitly specified. + +Example: + + + +```yaml +jaffle_shop: + target: dev_with_fal + outputs: + dev_with_fal: + type: fal + db_profile: dev_pg # This points to your main adapter + dev_pg: + type: postgres + ... +``` + + + +## Using `fal_environment` model configuration + +By creating a `fal_project.yml` in the same location as your `dbt_project.yml` and adding environment definitions in there: + + + +```yaml +environments: + - name: clustering + type: conda + packages: + - kmodes==0.12.2 + + - name: predict + type: venv + requirements: + - prophet +``` + + + +You can now reference any of these environments in your dbt Python models: + + + +```py +def model(dbt, fal): + dbt.config({ + "fal_environment": "clustering" + }) + + import pandas as pd + # kmodes is available because of the `fal_environment` being used + from kmodes.kmodes import KModes + + df: pd.DataFrame = dbt.ref("order_detailed") + df_train = df[["size", "is_vegan", "is_vegetarian", "is_keto", "shape"]] + + km_2 = KModes(n_clusters=3, init="Huang") + km_2.fit_predict(df_train) + df["cluster_label"] = km_2.labels_ + + return df +``` + + diff --git a/website/docs/reference/resource-configs/firebolt-configs.md b/website/docs/reference/resource-configs/firebolt-configs.md index 6fba429a2bd..5e0e0bf1d39 100644 --- 
a/website/docs/reference/resource-configs/firebolt-configs.md +++ b/website/docs/reference/resource-configs/firebolt-configs.md @@ -270,7 +270,7 @@ To install and use `dbt-external-tables` with Firebolt, you must: To use external tables, you must define a table as `external` in your `dbt_project.yml` file. Every external table must contain the fields `url`, `type`, and `object_pattern`. Note that the Firebolt external table specification requires fewer fields than what is specified in the dbt documentation. -In addition to specifying the columns, an external table may specify partitions. Partitions are not columns and they cannot have the same name as columns. To avoid yaml parsing errors, remember to encase string literals (such as the `url` and `object_pattern` values) in single quotation marks. +In addition to specifying the columns, an external table may specify partitions. Partitions are not columns and they cannot have the same name as columns. To avoid YAML parsing errors, remember to encase string literals (such as the `url` and `object_pattern` values) in single quotation marks. #### dbt_project.yml Syntax For an External Table @@ -315,7 +315,7 @@ $ dbt run-operation stage_external_sources --vars "ext_full_refresh: true" ## Incremental models -The [`incremental_strategy` configuration](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models#about-incremental_strategy) controls how dbt builds incremental models. Firebolt currently supports the `append` configuration. You can specify `incremental_strategy` in `dbt_project.yml` or within a model file's `config()` block. The `append` configuration is the default. Specifying this configuration is optional. +The [`incremental_strategy` configuration](https://docs.getdbt.com/docs/build/incremental-models#about-incremental_strategy) controls how dbt builds incremental models. Firebolt currently supports the `append` configuration. 
You can specify `incremental_strategy` in `dbt_project.yml` or within a model file's `config()` block. The `append` configuration is the default. Specifying this configuration is optional. The `append` strategy performs an `INSERT INTO` statement with all the new data based on the model definition. This strategy doesn't update or delete existing rows, so if you do not filter the data to the most recent records only, it is likely that duplicate records will be inserted. diff --git a/website/docs/reference/resource-configs/full_refresh.md b/website/docs/reference/resource-configs/full_refresh.md index abdcc358f3f..f75fe3a583b 100644 --- a/website/docs/reference/resource-configs/full_refresh.md +++ b/website/docs/reference/resource-configs/full_refresh.md @@ -1,5 +1,6 @@ --- resource_types: [models, seeds] +description: "Full_Refresh - Read this in-depth guide to learn about configurations in dbt." datatype: boolean --- @@ -17,7 +18,7 @@ datatype: boolean ```yml models: - [](resource-path): + [](/reference/resource-configs/resource-path): +full_refresh: false ``` @@ -48,7 +49,7 @@ The configured model(s) will not full-refresh when `dbt run --full-refresh` is i ```yml seeds: - [](resource-path): + [](/reference/resource-configs/resource-path): +full_refresh: false ``` @@ -84,7 +85,7 @@ This logic is encoded in the [`should_full_refresh()`](https://github.com/dbt-la ### Seeds - + ## Recommendation Set `full_refresh: false` for models of especially large datasets, which you would _never_ want dbt to fully drop and recreate. diff --git a/website/docs/reference/resource-configs/glue-configs.md b/website/docs/reference/resource-configs/glue-configs.md index 6834c08ff3f..c475f33aad1 100644 --- a/website/docs/reference/resource-configs/glue-configs.md +++ b/website/docs/reference/resource-configs/glue-configs.md @@ -1,5 +1,6 @@ --- title: "AWS Glue configurations" +description: "AWS Glue Configurations - Read this in-depth guide to learn about configurations in dbt." 
id: "glue-configs" --- @@ -10,7 +11,7 @@ To-do: ## Configuring tables -When materializing a model as `table`, you may include several optional configs that are specific to the dbt-glue plugin, in addition to the [Apache Spark model configuration](spark-configs#configuring-tables). +When materializing a model as `table`, you may include several optional configs that are specific to the dbt-glue plugin, in addition to the [Apache Spark model configuration](/reference/resource-configs/spark-configs#configuring-tables). | Option | Description | Required? | Example | |---------|----------------------------------------------------|-------------------------|--------------------------| @@ -233,8 +234,8 @@ group by 1 ## Persisting model descriptions -Relation-level docs persistence is inherited from dbt-spark, for more details, check [Apache Spark model configuration](spark-configs#persisting-model-descriptions). +Relation-level docs persistence is inherited from dbt-spark, for more details, check [Apache Spark model configuration](/reference/resource-configs/spark-configs#persisting-model-descriptions). ## Always `schema`, never `database` -This section is also inherited from dbt-spark, for more details, check [Apache Spark model configuration](spark-configs#always-schema-never-database). +This section is also inherited from dbt-spark, for more details, check [Apache Spark model configuration](/reference/resource-configs/spark-configs#always-schema-never-database). 
diff --git a/website/docs/reference/resource-configs/grants.md b/website/docs/reference/resource-configs/grants.md index c5c3b9e3ece..3a65672fa5e 100644 --- a/website/docs/reference/resource-configs/grants.md +++ b/website/docs/reference/resource-configs/grants.md @@ -35,7 +35,7 @@ Grants have two key components: ## Configuring grants -You can configure `grants` in `dbt_project.yml` to apply grants to many resources at once—all models in your project, a package, or a subfolder—and you can also configure `grants` one-by-one for specific resources, in yaml `config:` blocks or right within their `.sql` files. +You can configure `grants` in `dbt_project.yml` to apply grants to many resources at once—all models in your project, a package, or a subfolder—and you can also configure `grants` one-by-one for specific resources, in YAML `config:` blocks or right within their `.sql` files. @@ -83,7 +83,7 @@ seeds: -The `grants` config can also be defined under the `seeds` config block in `dbt_project.yml`. See [configs and properties](configs-and-properties) for details. +The `grants` config can also be defined under the `seeds` config block in `dbt_project.yml`. See [configs and properties](/reference/configs-and-properties) for details. @@ -106,7 +106,7 @@ The `grants` config can also be defined: - under the `snapshots` config block in `dbt_project.yml` - in a `config()` Jinja macro within a snapshot's SQL block -See [configs and properties](configs-and-properties) for details. +See [configs and properties](/reference/configs-and-properties) for details. @@ -121,7 +121,7 @@ For example: ```yml models: - +grants: + +grants: # In this case the + is not optional, you must include it for your project to parse. 
select: ['user_a', 'user_b'] ``` @@ -211,7 +211,7 @@ We encourage you to read Google's documentation for more context: - [Understanding GCP roles](https://cloud.google.com/iam/docs/understanding-roles) - [How to format grantees](https://cloud.google.com/bigquery/docs/reference/standard-sql/data-control-language#user_list) - + ### BigQuery examples @@ -243,12 +243,13 @@ models: - Databricks automatically enables `grants` on SQL endpoints. For interactive clusters, admins should enable grant functionality using these two setup steps in the Databricks documentation: - [Enable table access control for your workspace](https://docs.databricks.com/administration-guide/access-control/table-acl.html) - [Enable table access control for a cluster](https://docs.databricks.com/security/access-control/table-acls/table-acl.html) +- In order to grant `READ_METADATA` or `USAGE`, use [post-hooks](https://docs.getdbt.com/reference/resource-configs/pre-hook-post-hook)
          -* No special requirements at this time. +* Granting to / revoking from is only fully supported for Redshift users (not groups or roles).
          diff --git a/website/docs/reference/resource-configs/greenplum-configs.md b/website/docs/reference/resource-configs/greenplum-configs.md index b9115553542..cee2ff5d07e 100644 --- a/website/docs/reference/resource-configs/greenplum-configs.md +++ b/website/docs/reference/resource-configs/greenplum-configs.md @@ -1,5 +1,6 @@ --- title: "Greenplum configurations" +description: "Greenplum Configurations - Read this in-depth guide to learn about configurations in dbt." id: "greenplum-configs" --- @@ -130,4 +131,4 @@ To implement partitions into your dbt-model you need to specify the following co }} select * -``` \ No newline at end of file +``` diff --git a/website/docs/reference/resource-configs/group.md b/website/docs/reference/resource-configs/group.md new file mode 100644 index 00000000000..7515d8c5789 --- /dev/null +++ b/website/docs/reference/resource-configs/group.md @@ -0,0 +1,347 @@ +--- +resource_types: [models, seeds, snapshots, tests, analyses, metrics] +id: "group" +--- + +:::info New functionality +This functionality is new in v1.5. +::: + + + + + + +Support for grouping models was added in dbt Core v1.5 + + + + + + + + + +```yml +version: 2 + +models: + - name: model_name + group: finance +``` + + + + + +```yml +models: + [](resource-path): + +group: finance +``` + + + + + + +```sql + +{{ config( + group='finance' +) }} + +select ... + +``` + + + + + + + + + +Support for grouping seeds was added in dbt Core v1.5 + + + + + + + + + +```yml +models: + [](resource-path): + +group: finance +``` + + + + + +```yml +seeds: + - name: [] + group: finance +``` + + + + + + + + + + +Support for grouping snapshots was added in dbt Core v1.5 + + + + + + + + + +```yml +snapshots: + [](resource-path): + +group: finance +``` + + + + + +```sql +{% snapshot [snapshot_name](snapshot_name) %} + +{{ config( + group='finance' +) }} + +select ... 
+ +{% endsnapshot %} +``` + + + + + + + + + + +Support for grouping tests was added in dbt Core v1.5 + + + + + + + + + +```yml +tests: + [](resource-path): + +group: finance +``` + + + + + +```yml +version: 2 + +: + - name: + tests: + - : + config: + group: finance +``` + + + + + +```sql +{% test () %} + +{{ config( + group='finance' +) }} + +select ... + +{% endtest %} +``` + + + + + + +```sql +{{ config( + group='finance' +) }} +``` + + + + + + + + + +```yml +version: 2 + +analyses: + - name: + group: finance +``` + + + + + + + + + + +Support for grouping metrics was added in dbt Core v1.5 + + + + + + + + + +```yaml +metrics: + [](resource-path): + [+](plus-prefix)group: finance +``` + + + + + +```yaml +version: 2 + +metrics: + - name: [] + group: finance + +``` + + + + + + + + + +Support for grouping semantic models has been added in dbt Core v1.7. + + + + + + + +```yml +semantic_models: + - name: model_name + group: finance + +``` + + + + + +```yml +semantic_models: + [](resource-path): + +group: finance +``` + + + +The `group` configuration can be nested under the `config` key. + + + + + + + +## Definition +An optional configuration for grouping models, analysis, snapshots, tests, and metrics. When a resource is grouped, dbt will allow it to reference private models within the same group. + +For more details on reference access between resources in groups, check out [model access](/docs/collaborate/govern/model-access#groups). + +## Examples +### Prevent a 'marketing' group model from referencing a private 'finance' group model +This is useful if you want to prevent other groups from building on top of models that are rapidly changing, experimental, or otherwise internal to a group or team. + + + +```yml +models: + - name: finance_model + access: private + group: finance + - name: marketing_model + group: marketing +``` + + + + +```sql +select * from {{ ref('finance_model') }} +``` + + +```shell +$ dbt run -s marketing_model +... 
+dbt.exceptions.DbtReferenceError: Parsing Error + Node model.jaffle_shop.marketing_model attempted to reference node model.jaffle_shop.finance_model, + which is not allowed because the referenced node is private to the finance group. +``` + +## Related docs + +* [Model Access](/docs/collaborate/govern/model-access#groups) +* [Defining groups](/docs/build/groups) diff --git a/website/docs/reference/resource-configs/hive-configs.md b/website/docs/reference/resource-configs/hive-configs.md index 1fb498ec8ce..8906b3ae851 100644 --- a/website/docs/reference/resource-configs/hive-configs.md +++ b/website/docs/reference/resource-configs/hive-configs.md @@ -1,11 +1,12 @@ --- title: "Apache Hive configurations" +description: "Apache Hive Configurations - Read this in-depth guide to learn about configurations in dbt." id: "hive-configs" --- ## Configuring tables -When materializing a model as `table`, you may include several optional configs that are specific to the dbt-hive plugin, in addition to the standard [model configs](model-configs). +When materializing a model as `table`, you may include several optional configs that are specific to the dbt-hive plugin, in addition to the standard [model configs](/reference/model-configs). | Option | Description | Required? | Example | |---------|----------------------------------------------------|-------------------------|--------------------------| diff --git a/website/docs/reference/resource-configs/impala-configs.md b/website/docs/reference/resource-configs/impala-configs.md index 5564df115e7..83cbe311a8d 100644 --- a/website/docs/reference/resource-configs/impala-configs.md +++ b/website/docs/reference/resource-configs/impala-configs.md @@ -1,11 +1,12 @@ --- title: "Apache Impala configurations" +description: "Impala Configs - Read this in-depth guide to learn about configurations in dbt." 
id: "impala-configs" --- ## Configuring tables -When materializing a model as `table`, you may include several optional configs that are specific to the dbt-impala plugin, in addition to the standard [model configs](model-configs). +When materializing a model as `table`, you may include several optional configs that are specific to the dbt-impala plugin, in addition to the standard [model configs](/reference/model-configs). | Option | Description | Required? | Example | |---------|----------------------------------------------------|-------------------------|--------------------------| diff --git a/website/docs/reference/resource-configs/invalidate_hard_deletes.md b/website/docs/reference/resource-configs/invalidate_hard_deletes.md index 7db32709b74..ba5b37c5d71 100644 --- a/website/docs/reference/resource-configs/invalidate_hard_deletes.md +++ b/website/docs/reference/resource-configs/invalidate_hard_deletes.md @@ -1,9 +1,9 @@ --- resource_types: [snapshots] +description: "Invalidate_hard_deletes - Read this in-depth guide to learn about configurations in dbt." datatype: column_name --- -New in v0.19.0 ```jinja2 @@ -22,7 +22,7 @@ datatype: column_name ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +strategy: timestamp +invalidate_hard_deletes: true diff --git a/website/docs/reference/resource-configs/limit.md b/website/docs/reference/resource-configs/limit.md index 348d93bb720..db31d1bd923 100644 --- a/website/docs/reference/resource-configs/limit.md +++ b/website/docs/reference/resource-configs/limit.md @@ -3,14 +3,7 @@ resource_types: [tests] datatype: integer --- - - -* `v0.20.0`: Introduced `limit` config -* `v0.21.0`: Introduced `config` property for tests - - - -Limit the number of failures that will be returned by a test query. We recommend using this config when working with large datasets and [storing failures in the database](store_failures). +Limit the number of failures that will be returned by a test query. 
We recommend using this config when working with large datasets and [storing failures in the database](/reference/resource-configs/store_failures). -- **v1.2.0:** Enable the configuration of [clusters](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). +Enable the configuration of [clusters](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). - -The default [cluster](https://materialize.com/docs/overview/key-concepts/#clusters) that is used to maintain materialized views or indexes can be configured in your [profile](/reference/profiles.yml) using the `cluster` connection parameter. To override the cluster that is used for specific models (or groups of models), use the `cluster` configuration parameter. +The default [cluster](https://materialize.com/docs/overview/key-concepts/#clusters) that is used to maintain materialized views or indexes can be configured in your [profile](/docs/core/connect-data-platform/profiles.yml) using the `cluster` connection parameter. To override the cluster that is used for specific models (or groups of models), use the `cluster` configuration parameter. @@ -44,11 +43,7 @@ Materialize, at its core, is a real-time database that delivers incremental view ### Indexes - - -- **v1.2.0:** Enable additional configuration for [indexes](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). - - +Enable additional configuration for [indexes](https://github.com/MaterializeInc/materialize/blob/main/misc/dbt-materialize/CHANGELOG.md#120---2022-08-31). Like in any standard relational database, you can use [indexes](https://materialize.com/docs/overview/key-concepts/#indexes) to optimize query performance in Materialize. Improvements can be significant, reducing response times down to single-digit milliseconds. @@ -84,14 +79,7 @@ select ... 
### Tests - - -- **v1.1.1:** Provide support for storing the results of a test query in a materialized view, using the `store_failures` config. - - - -If you set the optional `--store-failures` flag or [`store_failures` config](resource-configs/store_failures), dbt will create a materialized view using the test query. This view is a continuously updating representation of failures. - +If you set the optional `--store-failures` flag or [`store_failures` config](/reference/resource-configs/store_failures), dbt will create a materialized view for each configured test that can keep track of failures over time. By default, test views are created in a schema suffixed with `dbt_test__audit`. To specify a custom suffix, use the `schema` config. ```yaml @@ -101,4 +89,4 @@ tests: +schema: test ``` - \ No newline at end of file + diff --git a/website/docs/reference/resource-configs/materialized.md b/website/docs/reference/resource-configs/materialized.md index d77a01eb468..ebf6ce452e9 100644 --- a/website/docs/reference/resource-configs/materialized.md +++ b/website/docs/reference/resource-configs/materialized.md @@ -2,6 +2,6 @@ These docs are a placeholder for a yet-to-be-written reference section. -Please refer to the [guide on materializations](materializations) for current documentation. +Please refer to the [guide on materializations](/docs/build/materializations) for current documentation. ::: diff --git a/website/docs/reference/resource-configs/meta.md b/website/docs/reference/resource-configs/meta.md index 948982925b0..aeff9ee6226 100644 --- a/website/docs/reference/resource-configs/meta.md +++ b/website/docs/reference/resource-configs/meta.md @@ -4,12 +4,6 @@ datatype: "{}" default_value: {} --- - - -* `v0.21.0`: `meta` is now a config that can be set in `dbt_project.yml` and as a `config` yaml property for some resource types. It is applied hierarchically and merges on a per-key basis. 
- - - @@ -46,7 +42,7 @@ The `meta` config can also be defined: - under the `models` config block in `dbt_project.yml` - in a `config()` Jinja macro within a model's SQL file -See [configs and properties](configs-and-properties) for details. +See [configs and properties](/reference/configs-and-properties) for details. @@ -59,11 +55,13 @@ version: 2 sources: - name: model_name - meta: {} + config: + meta: {} tables: - name: table_name - meta: {} + config: + meta: {} columns: - name: column_name @@ -95,7 +93,7 @@ seeds: -The `meta` config can also be defined under the `seeds` config block in `dbt_project.yml`. See [configs and properties](configs-and-properties) for details. +The `meta` config can also be defined under the `seeds` config block in `dbt_project.yml`. See [configs and properties](/reference/configs-and-properties) for details. @@ -123,7 +121,13 @@ The `meta` config can also be defined: - under the `snapshots` config block in `dbt_project.yml` - in a `config()` Jinja macro within a snapshot's SQL block -See [configs and properties](configs-and-properties) for details. +See [configs and properties](/reference/configs-and-properties) for details. + + + + + +You can't add YAML `meta` configs for [generic tests](/docs/build/tests#generic-tests). However, you can add `meta` properties to [singular tests](/docs/build/tests#singular-tests) using `config()` at the top of the test file. @@ -146,7 +150,6 @@ macros: arguments: - name: argument_name - meta: {} ``` @@ -171,6 +174,34 @@ exposures: + + + + +Support for setting `meta` on semantic models was added in dbt Core v1.7 + + + + + + + +```yml +semantic_models: + - name: semantic_people + model: ref('people') + config: + meta: {} + +``` +The `meta` configuration can be nested under the `config` key. + + + + + + + ## Definition @@ -178,12 +209,6 @@ The `meta` field can be used to set metadata for a resource. 
This metadata is co Depending on the resource you're configuring, `meta` may be available within the `config` property, or as a top-level key. (For backwards compatibility, `meta` is always supported as a top-level key, though without the capabilities of config inheritance.) - - -* `v0.16.0`: This property was introduced -* `v0.21.0`: Introduced the `config` property, and gave `meta` the capabilities of a config - - ## Examples ### Designate a model owner @@ -253,3 +278,4 @@ select 1 as id ```
          + diff --git a/website/docs/reference/resource-configs/mssql-configs.md b/website/docs/reference/resource-configs/mssql-configs.md index 502dd0d574b..9a94f794e86 100644 --- a/website/docs/reference/resource-configs/mssql-configs.md +++ b/website/docs/reference/resource-configs/mssql-configs.md @@ -29,7 +29,7 @@ values={[ ```sql {{ config( - as_columnstore='False' + as_columnstore=false ) }} @@ -117,7 +117,7 @@ from ... ## Grants with auto provisioning -dbt 1.2 introduced the capability to grant/revoke access using the `grants` [configuration option](grants). +dbt 1.2 introduced the capability to grant/revoke access using the `grants` [configuration option](/reference/resource-configs/grants). In dbt-sqlserver, you can additionally set `auto_provision_aad_principals` to `true` in your model configuration if you are using Azure Active Directory authentication with an Azure SQL Database or Azure Synapse Dedicated SQL Pool. This will automatically create the Azure Active Directory principal inside your database if it does not exist yet. @@ -135,6 +135,17 @@ models:
          +## Permissions + +The following permissions are required for the user executing dbt: + +* `CREATE SCHEMA` on the database level (or you can create the schema in advance) +* `CREATE TABLE` on the database level (or on the user's own schema if the schema is already created) +* `CREATE VIEW` on the database level (or on the user's own schema if the schema is already created) +* `SELECT` on the tables/views being used as dbt sources + +The 3 `CREATE` permissions above are required on the database level if you want to make use of tests or snapshots in dbt. You can work around this by creating the schemas used for testing and snapshots in advance and granting the right roles. + ## cross-database macros The following macros are currently not supported: diff --git a/website/docs/reference/resource-configs/no-configs.md b/website/docs/reference/resource-configs/no-configs.md index a9fe4ec8752..5a4ba4eaaa2 100644 --- a/website/docs/reference/resource-configs/no-configs.md +++ b/website/docs/reference/resource-configs/no-configs.md @@ -8,4 +8,4 @@ If you were guided to this page from a data platform setup article, it most like - Setting up the profile is the only action the end-user needs to take on the data platform, or - The subsequent actions the end-user needs to take are not currently documented -If you'd like to contribute to data platform-specifc configuration information, refer to [Documenting a new adapter](5-documenting-a-new-adapter) \ No newline at end of file +If you'd like to contribute to data platform-specific configuration information, refer to [Documenting a new adapter](/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter) \ No newline at end of file diff --git a/website/docs/reference/resource-configs/oracle-configs.md b/website/docs/reference/resource-configs/oracle-configs.md new file mode 100644 index 00000000000..a050ba85b49 --- /dev/null +++ b/website/docs/reference/resource-configs/oracle-configs.md @@ -0,0 +1,74 @@ +---
+title: "Oracle configurations" +id: "oracle-configs" +--- + + + +## Use `parallel` hint + +Table materialization supports specifying the number of parallel executions, as shown below: + +```sql +-- Create a dbt model using 4 parallel executions +{{config(materialized='table', parallel=4)}} +SELECT c.cust_id, c.cust_first_name, c.cust_last_name +from {{ source('sh_database', 'customers') }} c +``` + +## Use `table_compression_clause` + +Table materialization supports different compression clauses, as shown below. + +### Advanced Row Compression + +With Advanced compression enabled, Oracle Database maintains compression during all types of data manipulation operations, including conventional DML such as INSERT and UPDATE. +`ROW STORE COMPRESS ADVANCED` is recommended in OLTP systems. + +```sql +-- Advanced Row compression +{{config(materialized='table', table_compression_clause='ROW STORE COMPRESS ADVANCED')}} +SELECT c.cust_id, c.cust_first_name, c.cust_last_name +from {{ source('sh_database', 'customers') }} c +``` + +### Hybrid Columnar Compression + +#### Querying + +`COLUMN STORE COMPRESS FOR QUERY` is useful in data warehouse environments. Valid values are `HIGH` or `LOW`, with `HIGH` providing a higher compression ratio. The default is `HIGH`. + +```sql +{{config(materialized='table', table_compression_clause='COLUMN STORE COMPRESS FOR QUERY LOW')}} +SELECT c.cust_id, c.cust_first_name, c.cust_last_name +from {{ source('sh_database', 'customers') }} c +``` + +or + +```sql +{{config(materialized='table', table_compression_clause='COLUMN STORE COMPRESS FOR QUERY HIGH')}} +SELECT c.cust_id, c.cust_first_name, c.cust_last_name +from {{ source('sh_database', 'customers') }} c +``` + +#### Archival + +`COLUMN STORE COMPRESS FOR ARCHIVE` supports a higher compression ratio than `COLUMN STORE COMPRESS FOR QUERY` and is useful for archival. Valid values are `HIGH` or `LOW`, with `HIGH` providing the highest compression ratio.
The default is `LOW` + +```sql +{{config(materialized='table', table_compression_clause='COLUMN STORE COMPRESS FOR ARCHIVE LOW')}} +SELECT c.cust_id, c.cust_first_name, c.cust_last_name +from {{ source('sh_database', 'customers') }} c +``` + +or + +```sql +{{config(materialized='table', table_compression_clause='COLUMN STORE COMPRESS FOR ARCHIVE HIGH')}} +SELECT c.cust_id, c.cust_first_name, c.cust_last_name +from {{ source('sh_database', 'customers') }} c +``` + + + diff --git a/website/docs/reference/resource-configs/persist_docs.md b/website/docs/reference/resource-configs/persist_docs.md index 9dec5b6cefe..15b1e0bdb40 100644 --- a/website/docs/reference/resource-configs/persist_docs.md +++ b/website/docs/reference/resource-configs/persist_docs.md @@ -1,5 +1,6 @@ --- id: "persist_docs" +description: "Persist_docs - Read this in-depth guide to learn about configurations in dbt." datatype: Dict[Str, Bool] --- @@ -20,7 +21,7 @@ datatype: Dict[Str, Bool] ```yml models: - [](resource-path): + [](/reference/resource-configs/resource-path): +persist_docs: relation: true columns: true @@ -57,7 +58,7 @@ This config is not implemented for sources. ```yml seeds: - [](resource-path): + [](/reference/resource-configs/resource-path): +persist_docs: relation: true columns: true @@ -74,7 +75,7 @@ seeds: ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +persist_docs: relation: true columns: true @@ -106,18 +107,11 @@ select ... ## Definition -Optionally persist [resource descriptions](resource-properties/description) as +Optionally persist [resource descriptions](/reference/resource-properties/description) as column and relation comments in the database. By default, documentation persistence is disabled, but it can be enabled for specific resources or groups of resources as needed. 
- - - - Support for this config on Redshift, Postgres, and Snowflake is new in 0.17.0 - - Support for column-level docs persistence is new for all databases in 0.17.0 - - - ## Support The `persist_docs` config is supported on the most widely used dbt adapters: @@ -125,7 +119,8 @@ The `persist_docs` config is supported on the most widely used dbt adapters: - Redshift - Snowflake - BigQuery -- Apache Spark & Databricks +- Databricks +- Apache Spark However, some databases limit where and how descriptions can be added to database objects. Those database adapters might not support `persist_docs`, or might offer only partial support. @@ -138,6 +133,7 @@ Some known issues and limitations: - Column-level comments require `file_format: delta` (or another "v2 file format") - Column-level comments aren't supported for models materialized as views ([issue](https://github.com/dbt-labs/dbt-spark/issues/372)) +
          @@ -148,18 +144,6 @@ Some known issues and limitations: - - -- Column names that must be quoted, such as column names containing special characters, will cause runtime errors if column-level `persist_docs` is enabled. This is fixed in v1.2. - - - - - -- Column-level comments aren't supported for models materialized as views - - -
          @@ -168,7 +152,7 @@ Some known issues and limitations: ### Documenting columns and relations -Supply a [description](resource-properties/description) for a model: +Supply a [description](/reference/resource-properties/description) for a model: diff --git a/website/docs/reference/resource-configs/plus-prefix.md b/website/docs/reference/resource-configs/plus-prefix.md index c054ed5a81b..c1adbc0286a 100644 --- a/website/docs/reference/resource-configs/plus-prefix.md +++ b/website/docs/reference/resource-configs/plus-prefix.md @@ -3,9 +3,9 @@ title: Using the + prefix --- -The `+` prefix is a dbt syntax feature, introduced in dbt v0.17.0, which helps disambiguate between [resource paths](resource-path) and configs in `dbt_project.yml` files. +The `+` prefix is a dbt syntax feature, introduced in dbt v0.17.0, which helps disambiguate between [resource paths](/reference/resource-configs/resource-path) and configs in `dbt_project.yml` files. -It is only compatible with `dbt_project.yml` files that use [`config-version](config-version): 2` +It is not compatible with `dbt_project.yml` files that use [`config-version`](/reference/project-configs/config-version) 1. For example: @@ -29,7 +29,7 @@ models: Throughout this documentation, we've tried to be consistent in using the `+` prefix in `dbt_project.yml` files. However, the leading `+` is in fact _only required_ when you need to disambiguate between resource paths and configs, for example when: -- A config accepts a dictionary as its inputs, for example, the [`persist_docs` config](persist_docs). +- A config accepts a dictionary as its inputs, for example, the [`persist_docs` config](/reference/resource-configs/persist_docs). - Or, a config shares a key with part of a resource path, for example, if you had a directory of models named `tags`. 
@@ -65,6 +65,6 @@ When adding configs in `dbt_project.yml`, it doesn't hurt to use the `+` prefix, -**Note:** This use of the `+` prefix, in `dbt_project.yml`, is distinct from the use of `+` to control config merge behavior (clobber vs. add) in other config settings (specific resource `.yml` and `.sql` files). Currently, the only config which supports `+` for controlling config merge behavior is [`grants`](grants#grant-config-inheritance). +**Note:** This use of the `+` prefix, in `dbt_project.yml`, is distinct from the use of `+` to control config merge behavior (clobber vs. add) in other config settings (specific resource `.yml` and `.sql` files). Currently, the only config which supports `+` for controlling config merge behavior is [`grants`](/reference/resource-configs/grants#grant-config-inheritance). diff --git a/website/docs/reference/resource-configs/postgres-configs.md b/website/docs/reference/resource-configs/postgres-configs.md index d28c073b065..97a695ee12e 100644 --- a/website/docs/reference/resource-configs/postgres-configs.md +++ b/website/docs/reference/resource-configs/postgres-configs.md @@ -1,18 +1,32 @@ --- title: "Postgres configurations" +description: "Postgres Configurations - Read this in-depth guide to learn about configurations in dbt." id: "postgres-configs" --- +## Incremental materialization strategies -## Performance Optimizations +In dbt-postgres, the following incremental materialization strategies are supported: -### Unlogged + + +- `append` (default) +- `delete+insert` + + - + - - **v0.14.1:** Introduced native support for `unlogged` config +- `append` (default) +- `merge` +- `delete+insert` - + + + +## Performance optimizations + +### Unlogged "Unlogged" tables can be considerably faster than ordinary tables, as they are not written to the write-ahead log nor replicated to read replicas. They are also considerably less safe than ordinary tables. 
See [Postgres docs](https://www.postgresql.org/docs/current/sql-createtable.html#SQL-CREATETABLE-UNLOGGED) for details. @@ -37,15 +51,9 @@ models: ### Indexes -While Postgres works reasonably well for datasets smaller than about 10mm rows, database tuning is sometimes required. It's important to create indexes for columns that are commonly used in joins or where clauses. - - - - - **v0.20.0:** Introduced native support for `indexes` config +While Postgres works reasonably well for datasets smaller than about 10m rows, database tuning is sometimes required. It's important to create indexes for columns that are commonly used in joins or where clauses. - - -Table models, incremental models, seeds, and snapshots may have a list of `indexes` defined. Each Postgres index can have three components: +Table models, incremental models, seeds, snapshots, and materialized views may have a list of `indexes` defined. Each Postgres index can have three components: - `columns` (list, required): one or more columns on which the index is defined - `unique` (boolean, optional): whether the index should be [declared unique](https://www.postgresql.org/docs/9.4/indexes-unique.html) - `type` (string, optional): a supported [index type](https://www.postgresql.org/docs/current/indexes-types.html) (B-tree, Hash, GIN, etc) @@ -95,3 +103,38 @@ models: ``` + + + +## Materialized views + +The Postgres adapter supports [materialized views](https://www.postgresql.org/docs/current/rules-materializedviews.html). +Indexes are the only configuration that is specific to `dbt-postgres`. +The remaining configuration follows the general [materialized view](/docs/build/materializations#materialized-view) configuration. +There are also some limitations that we hope to address in the next version. + +### Monitored configuration changes + +The settings below are monitored for changes applicable to `on_configuration_change`. 
+ +#### Indexes + +Index changes (`CREATE`, `DROP`) can be applied without the need to rebuild the materialized view. +This differs from a table model, where the table needs to be dropped and re-created to update the indexes. +If the `indexes` portion of the `config` block is updated, the changes will be detected and applied +directly to the materialized view in place. + +### Limitations + +#### Changing materialization to and from "materialized_view" + +Swapping an already materialized model to a materialized view, and vice versa, is not supported. +The workaround is to manually drop the existing materialization in the data warehouse prior to calling `dbt run`. +Running with the `--full-refresh` flag will not work to drop the existing table or view and create the materialized view (and vice versa). +This would only need to be done once as the existing object would then be a materialized view. + +For example, assume `my_model` has already been materialized as a table in the underlying data platform via `dbt run`. +If the user changes the model's config to `materialized="materialized_view"`, they will get an error. +The solution is to execute `DROP TABLE my_model` on the data warehouse before trying the model again. + + diff --git a/website/docs/reference/resource-configs/pre-hook-post-hook.md b/website/docs/reference/resource-configs/pre-hook-post-hook.md index d3fb6c9c3e7..297d6975d6f 100644 --- a/website/docs/reference/resource-configs/pre-hook-post-hook.md +++ b/website/docs/reference/resource-configs/pre-hook-post-hook.md @@ -1,5 +1,6 @@ --- title: pre-hook & post-hook +description: "Pre-hook and Post-hook - Read this in-depth guide to learn about configurations in dbt."
resource_types: [models, seeds, snapshots] datatype: sql-statement | [sql-statement] --- @@ -15,14 +16,16 @@ datatype: sql-statement | [sql-statement] + + ```yml models: - [](resource-path): - +pre-hook: | [] - +post-hook: | [] + [](/reference/resource-configs/resource-path): + +pre-hook: SQL-statement | [SQL-statement] + +post-hook: SQL-statement | [SQL-statement] ``` @@ -33,8 +36,8 @@ models: ```sql {{ config( - pre_hook="" | [""], - post_hook="" | [""], + pre_hook="SQL-statement" | ["SQL-statement"], + post_hook="SQL-statement" | ["SQL-statement"], ) }} select ... @@ -48,14 +51,16 @@ select ... + + ```yml seeds: - [](resource-path): - +pre-hook: | [] - +post-hook: | [] + [](/reference/resource-configs/resource-path): + +pre-hook: SQL-statement | [SQL-statement] + +post-hook: SQL-statement | [SQL-statement] ``` @@ -65,14 +70,16 @@ seeds: + + ```yml snapshots: - [](resource-path): - +pre-hook: | [] - +post-hook: | [] + [](/reference/resource-configs/resource-path): + +pre-hook: SQL-statement | [SQL-statement] + +post-hook: SQL-statement | [SQL-statement] ``` @@ -83,8 +90,8 @@ snapshots: ```sql {% snapshot snapshot_name %} {{ config( - pre_hook="" | [""], - post_hook="" | [""], + pre_hook="SQL-statement" | ["SQL-statement"], + post_hook="SQL-statement" | ["SQL-statement"], ) }} select ... @@ -102,22 +109,15 @@ select ... ## Definition A SQL statement (or list of SQL statements) to be run before or after a model, seed, or snapshot is built. -Pre- and post-hooks can also call macros that return SQL statements. If your macro depends on values available only at execution time, such as using model configurations or `ref()` calls to other resources as inputs, you will need to [wrap your macro call in an extra set of curly braces](dont-nest-your-curlies#an-exception). +Pre- and post-hooks can also call macros that return SQL statements. 
If your macro depends on values available only at execution time, such as using model configurations or `ref()` calls to other resources as inputs, you will need to [wrap your macro call in an extra set of curly braces](/docs/building-a-dbt-project/dont-nest-your-curlies#an-exception). ### Why would I use hooks? dbt aims to provide all the boilerplate SQL you need (DDL, DML, and DCL) via out-of-the-box functionality, which you can configure quickly and concisely. In some cases, there may be SQL that you want or need to run, specific to functionality in your data platform, which dbt does not (yet) offer as a built-in feature. In those cases, you can write the exact SQL you need, using dbt's compilation context, and pass it into a `pre-` or `post-` hook to run before or after your model, seed, or snapshot. - - -* `v0.12.2`: The `post_hook` alias for config blocks was introduced. Prior to this, users needed to use the alternative config syntax to apply pre- and post-hooks. - - - - ## Examples - + @@ -143,7 +143,7 @@ See: [Redshift docs on `UNLOAD`](https://docs.aws.amazon.com/redshift/latest/dg/ ```yml -model: +models: jaffle_shop: # this is the project name marts: finance: @@ -160,71 +160,8 @@ See: [Apache Spark docs on `ANALYZE TABLE`](https://spark.apache.org/docs/latest - - -### Grant privileges on a model - - - -```yml - -models: - +post-hook: "grant select on {{ this }} to group reporter" - -``` - - - -### Grant multiple privileges on a model - - - -```yml - -models: - +post-hook: - - "grant select on {{ this }} to group reporter" - - "grant select on {{ this }} to group transformer" - -``` - - - -### Call a macro to grant privileges on a model - - - -```yml - -model: - +post-hook: "{{ grant_select(this) }}" - -``` - - - - -### Grant privileges on a directory of models - - - -```yml - -model: - jaffle_shop: # this is the project name - marts: - marketing: - # this will be applied to all models in marts/marketing/ - +post-hook: "{{ grant_select(this) }}" - -``` - 
- - - - ### Additional examples -We've compiled some more in-depth examples [here](hooks-operations#additional-examples). +We've compiled some more in-depth examples [here](/docs/build/hooks-operations#additional-examples). ## Usage notes ### Hooks are cumulative @@ -233,7 +170,7 @@ If you define hooks in both your `dbt_project.yml` and in the `config` block of ### Execution ordering If multiple instances of any hooks are defined, dbt will run each hook using the following ordering: 1. Hooks from dependent packages will be run before hooks in the active package. -2. Hooks defined within the model itself will be run before hooks defined in `dbt_project.yml`. +2. Hooks defined within the model itself will be run after hooks defined in `dbt_project.yml`. 3. Hooks within a given context will be run in the order in which they are defined. @@ -253,8 +190,8 @@ To achieve this, you can use one of the following syntaxes. (Note: You should NO ```sql {{ config( - pre_hook=before_begin(""), - post_hook=after_commit("") + pre_hook=before_begin("SQL-statement"), + post_hook=after_commit("SQL-statement") ) }} @@ -271,11 +208,11 @@ select ... {{ config( pre_hook={ - "sql": "", + "sql": "SQL-statement", "transaction": False }, post_hook={ - "sql": "", + "sql": "SQL-statement", "transaction": False } ) @@ -295,10 +232,10 @@ select ... models: +pre-hook: - sql: "" + sql: "SQL-statement" transaction: false +post-hook: - sql: "" + sql: "SQL-statement" transaction: false diff --git a/website/docs/reference/resource-configs/quote_columns.md b/website/docs/reference/resource-configs/quote_columns.md index 5701fe0f11f..660d59b9e99 100644 --- a/website/docs/reference/resource-configs/quote_columns.md +++ b/website/docs/reference/resource-configs/quote_columns.md @@ -1,5 +1,6 @@ --- resource_types: [seeds] +description: "Quote_columns - Read this in-depth guide to learn about configurations in dbt." 
datatype: boolean default_value: false --- @@ -8,15 +9,8 @@ default_value: false An optional seed configuration, used to determine whether column names in the seed file should be quoted when the is created. * When `True`, dbt will quote the column names defined in the seed file when building a table for the seed, preserving casing. -* (Default) When `False`, dbt will not quote the column names defined in the seed file. - - - -* `v0.15.0`: Introduced in v0.15.0, with a default of False -* `v0.21.0`: Introduced `config` property for seeds -* Future: The default value may change in a future release. If you're using seed files, it is recommended that you set this configuration explicitly to avoid breaking changes in the future. - - +* When `False`, dbt will not quote the column names defined in the seed file. +* When not set, it will vary by adapter whether or not column names are quoted. ## Usage ### Globally quote all seed columns diff --git a/website/docs/reference/resource-configs/redshift-configs.md b/website/docs/reference/resource-configs/redshift-configs.md index 2fa6439d4fd..9bd127a1e1a 100644 --- a/website/docs/reference/resource-configs/redshift-configs.md +++ b/website/docs/reference/resource-configs/redshift-configs.md @@ -1,5 +1,6 @@ --- title: "Redshift configurations" +description: "Redshift Configurations - Read this in-depth guide to learn about configurations in dbt." id: "redshift-configs" --- @@ -9,11 +10,32 @@ To-do: - think about whether some of these should be outside of models ---> -## Performance Optimizations +## Incremental materialization strategies + +In dbt-redshift, the following incremental materialization strategies are supported: + + + +- `append` (default) +- `delete+insert` + + + + + +- `append` (default) +- `merge` +- `delete+insert` + + + +All of these strategies are inherited from dbt-postgres. 
+ +## Performance optimizations ### Using sortkey and distkey -Tables in Amazon Redshift have two powerful optimizations to improve query performance: distkeys and sortkeys. Supplying these values as model-level configurations apply the corresponding settings in the generated `CREATE TABLE` . Note that these settings will have no effect for models set to `view` or `ephemeral` models. +Tables in Amazon Redshift have two powerful optimizations to improve query performance: distkeys and sortkeys. Supplying these values as model-level configurations apply the corresponding settings in the generated `CREATE TABLE` . Note that these settings will have no effect on models set to `view` or `ephemeral` models. - `dist` can have a setting of `all`, `even`, `auto`, or the name of a key. - `sort` accepts a list of sort keys, for example: `['timestamp', 'userid']`. dbt will build the sort key in the same order the fields are supplied. @@ -53,7 +75,7 @@ For more information on distkeys and sortkeys, view Amazon's docs: - [AWS Documentation » Amazon Redshift » Database Developer Guide » Designing Tables » Choosing a Data Distribution Style](https://docs.aws.amazon.com/redshift/latest/dg/t_Distributing_data.html) - [AWS Documentation » Amazon Redshift » Database Developer Guide » Designing Tables » Choosing Sort Keys](https://docs.aws.amazon.com/redshift/latest/dg/t_Sorting_data.html) -## Late Binding Views +## Late binding views Redshift supports views unbound from their dependencies, or [late binding views](https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_VIEW.html#late-binding-views). This DDL option "unbinds" a view from the data it selects from. In practice, this means that if upstream views or tables are dropped with a cascade qualifier, the late-binding view does not get dropped as well. 
@@ -84,3 +106,54 @@ models: ``` + + + +## Materialized views + +The Redshift adapter supports [materialized views](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-overview.html). +Redshift-specific configuration includes the typical `dist`, `sort_type`, `sort`, and `backup`. +For materialized views, there is also the `auto_refresh` setting, which allows Redshift to [automatically refresh](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-refresh.html) the materialized view for you. +The remaining configuration follows the general [materialized view](/docs/build/materializations#Materialized-View) configuration. +There are also some limitations that we hope to address in the next version. + +### Monitored configuration changes + +The settings below are monitored for changes applicable to `on_configuration_change`. + +#### Dist + +Changes to `dist` will result in a full refresh of the existing materialized view (applied at the time of the next `dbt run` of the model). Redshift requires a materialized view to be +dropped and recreated to apply a change to the `distkey` or `diststyle`. + +#### Sort type, sort + +Changes to `sort_type` or `sort` will result in a full refresh. Redshift requires a materialized +view to be dropped and recreated to apply a change to the `sortkey` or `sortstyle`. + +#### Backup + +Changes to `backup` will result in a full refresh. Redshift requires a materialized +view to be dropped and recreated to apply a change to the `backup` setting. + +#### Auto refresh + +The `auto_refresh` setting can be updated via an `ALTER` statement. This setting effectively toggles +automatic refreshes on or off. The default setting for this config is off (`False`). 
If this +is the only configuration change for the materialized view, dbt will choose to apply +an `ALTER` statement instead of issuing a full refresh. + +### Limitations + +#### Changing materialization from "materialized_view" to "table" or "view" + +Swapping a materialized view to a table or view is not supported. +You must manually drop the existing materialized view in the data warehouse prior to calling `dbt run`. +Normally, re-running with the `--full-refresh` flag would resolve this, but not in this case. +This would only need to be done once as the existing object would then be a table or view. + +For example, assume that a materialized view, `my_mv.sql`, has already been materialized to the underlying data platform via `dbt run`. +If the user changes the model's config to `materialized="table"`, they will get an error. +The workaround is to execute `DROP MATERIALIZED VIEW my_mv CASCADE` on the data warehouse before trying the model again. + + diff --git a/website/docs/reference/resource-configs/resource-path.md b/website/docs/reference/resource-configs/resource-path.md index 561d5e32ac8..258b83dcd57 100644 --- a/website/docs/reference/resource-configs/resource-path.md +++ b/website/docs/reference/resource-configs/resource-path.md @@ -18,7 +18,7 @@ models: -To apply a configuration to all models in _your_ project only, use your [project name](project-configs/name) as the `<resource-path>`: +To apply a configuration to all models in _your_ project only, use your [project name](/reference/project-configs/name) as the `<resource-path>`: diff --git a/website/docs/reference/resource-configs/schema.md b/website/docs/reference/resource-configs/schema.md index 3edb96f2ea6..c976bf6502a 100644 --- a/website/docs/reference/resource-configs/schema.md +++ b/website/docs/reference/resource-configs/schema.md @@ -1,5 +1,7 @@ --- +sidebar_label: "schema" resource_types: [models, seeds, tests] +description: "Schema - Read this in-depth guide to learn about configurations in dbt."
datatype: string --- @@ -9,7 +11,7 @@ This is a work in progress document. While this configuration applies to multipl ::: ## Definition -Optionally specify a custom schema for a [model](/docs/build/sql-models) or [seed](/docs/build/seeds). (To specify a schema for a [snapshot](snapshots), use the [`target_schema` config](target_schema)). +Optionally specify a custom schema for a [model](/docs/build/sql-models) or [seed](/docs/build/seeds). (To specify a schema for a [snapshot](/docs/build/snapshots), use the [`target_schema` config](/reference/resource-configs/target_schema)). When dbt creates a relation (table/view) in a database, it creates it as: `{{ database }}.{{ schema }}.{{ identifier }}`, e.g. `analytics.finance.payments` @@ -60,7 +62,7 @@ seeds: ### Tests -Customize the name of the schema in which tests [configured to store failures](resource-configs/store_failures) will save their results: +Customize the name of the schema in which tests [configured to store failures](/reference/resource-configs/store_failures) will save their results: diff --git a/website/docs/reference/resource-configs/severity.md b/website/docs/reference/resource-configs/severity.md index 436bfda9460..25bab9647d6 100644 --- a/website/docs/reference/resource-configs/severity.md +++ b/website/docs/reference/resource-configs/severity.md @@ -6,15 +6,7 @@ resource_types: [tests] datatype: string --- - - -* `v0.14.0`: Introduced `severity` config -* `v0.20.0`: Introduced `error_if` + `warn_if` configs. Enabled configuration of tests from `dbt_project.yml` -* `v0.21.0`: Introduced `config` property for tests - - - -Tests return a number of failures—most often, this is the count of rows returned by the test query, but it could be a [custom calculation](resource-configs/fail_calc). Generally, if the number of failures is nonzero, the test returns an error. This makes sense, as test queries are designed to return all the rows you _don't_ want: duplicate records, null values, etc.
+Tests return a number of failures—most often, this is the count of rows returned by the test query, but it could be a [custom calculation](/reference/resource-configs/fail_calc). Generally, if the number of failures is nonzero, the test returns an error. This makes sense, as test queries are designed to return all the rows you _don't_ want: duplicate records, null values, etc. It's possible to configure tests to return warnings instead of errors, or to make the test status conditional on the number of failures returned. Maybe 1 duplicate record can count as a warning, but 10 duplicate records should count as an error. @@ -26,10 +18,10 @@ The relevant configs are: Conditional expressions can be any comparison logic that is supported by your SQL syntax with an integer number of failures: `> 5`, `= 0`, `between 5 and 10`, and so on. Here's how those play in practice: -- If `severity: error`, dbt will check the `error_if` condition first. If the error condition is met, the test returns an error. If it's not met, dbt will then check the `warn_if` condition. If the warn condition is met, the test warns; if it's not met, the test passes. +- If `severity: error`, dbt will check the `error_if` condition first. If the error condition is met, the test returns an error. If it's not met, dbt will then check the `warn_if` condition (defaulted to `!=0`). If it's not specified or the warn condition is met, the test warns; if it's not met, the test passes. - If `severity: warn`, dbt will skip the `error_if` condition entirely and jump straight to the `warn_if` condition. If the warn condition is met, the test warns; if it's not met, the test passes. -Note that test warn statuses will return errors instead if the [`--warn-error`](global-cli-flags#warnings-as-errors) flag is passed. Unless dbt is told to treat warnings as errors, a test with `warn` severity will never return an error. 
+Note that test warn statuses will return errors instead if the [`--warn-error`](/reference/global-cli-flags#warnings-as-errors) flag is passed. Unless dbt is told to treat warnings as errors, a test with `warn` severity will never return an error. - - - **v1.1.2:** Added support for for `storage_type`, `indexes`, `primary_key`, `sort_key`, `shard_key`, `unique_table_key`, `charset`, `collation` options for creating SingleStore tables. - - - ## Performance Optimizations [SingleStore Physical Database Schema Design documentation](https://docs.singlestore.com/managed-service/en/create-a-database/physical-database-schema-design/concepts-of-physical-database-schema-design.html) is helpful if you want to use specific options (that are described below) in your dbt project. diff --git a/website/docs/reference/resource-configs/snapshot_name.md b/website/docs/reference/resource-configs/snapshot_name.md index ca2c8beab91..bb4826a116b 100644 --- a/website/docs/reference/resource-configs/snapshot_name.md +++ b/website/docs/reference/resource-configs/snapshot_name.md @@ -1,3 +1,7 @@ +--- +description: "Snapshot-name - Read this in-depth guide to learn about configurations in dbt." +--- + ```jinja2 @@ -11,9 +15,9 @@ ## Description -The name of a snapshot, as defined in the `{% snapshot %}` block header. This name is used when selecting from a snapshot using the [`ref` function](ref) +The name of a snapshot, as defined in the `{% snapshot %}` block header. This name is used when selecting from a snapshot using the [`ref` function](/reference/dbt-jinja-functions/ref) -This name must not conflict with any other snapshot names, or any model names. +This name must not conflict with the name of any other "refable" resource (models, seeds, other snapshots) defined in this project or package. The name does not need to match the file name. As a result, snapshot filenames do not need to be unique. 
diff --git a/website/docs/reference/resource-configs/snowflake-configs.md b/website/docs/reference/resource-configs/snowflake-configs.md index 140a716a53f..30c7966ec68 100644 --- a/website/docs/reference/resource-configs/snowflake-configs.md +++ b/website/docs/reference/resource-configs/snowflake-configs.md @@ -1,6 +1,7 @@ --- title: "Snowflake configurations" id: "snowflake-configs" +description: "Snowflake Configurations - Read this in-depth guide to learn about configurations in dbt." --- + + +:::note +See [Databricks configuration](#databricks-configs) for the Databricks version of this page. +::: + ## Configuring tables -When materializing a model as `table`, you may include several optional configs that are specific to the dbt-spark plugin, in addition to the standard [model configs](model-configs). +When materializing a model as `table`, you may include several optional configs that are specific to the dbt-spark plugin, in addition to the standard [model configs](/reference/model-configs). | Option | Description | Required? | Example | |---------|------------------------------------------------------------------------------------------------------------------------------------|-------------------------|--------------------------| -| file_format | The file format to use when creating tables (`parquet`, `delta`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `parquet`| +| file_format | The file format to use when creating tables (`parquet`, `delta`, `iceberg`, `hudi`, `csv`, `json`, `text`, `jdbc`, `orc`, `hive` or `libsvm`). | Optional | `parquet`| | location_root | The created table uses the specified directory to store its data. The table alias is appended to it. | Optional | `/mnt/root` | | partition_by | Partition the created table by the specified columns. A directory is created for each partition. 
| Optional | `date_day` | | clustered_by | Each partition in the created table will be split into a fixed number of buckets by the specified columns. | Optional | `country_code` | @@ -22,20 +29,14 @@ When materializing a model as `table`, you may include several optional configs ## Incremental models - - - - `dbt-spark==0.19.0`: Added the `append` strategy as default for all platforms, file types, and connection methods. - - - dbt seeks to offer useful, intuitive modeling abstractions by means of its built-in configurations and materializations. Because there is so much variance between Apache Spark clusters out in the world—not to mention the powerful features offered to Databricks users by the Delta file format and custom runtime—making sense of all the available options is an undertaking in its own right. -Alternatively, you can use Apache Hudi file format with Apache Spark runtime for building incremental models. +Alternatively, you can use Apache Iceberg or Apache Hudi file format with Apache Spark runtime for building incremental models. For that reason, the dbt-spark plugin leans heavily on the [`incremental_strategy` config](/docs/build/incremental-models#about-incremental_strategy). This config tells the incremental materialization how to build models in runs beyond their first. It can be set to one of three values: - **`append`** (default): Insert new records without updating or overwriting any existing data. - **`insert_overwrite`**: If `partition_by` is specified, overwrite partitions in the table with new data. If no `partition_by` is specified, overwrite the entire table with new data. - - **`merge`** (Delta and Hudi file format only): Match records based on a `unique_key`; update old records, insert new ones. (If no `unique_key` is specified, all new data is inserted, similar to `append`.) + - **`merge`** (Delta, Iceberg and Hudi file format only): Match records based on a `unique_key`; update old records, insert new ones.
(If no `unique_key` is specified, all new data is inserted, similar to `append`.) Each of these strategies has its pros and cons, which we'll discuss below. As with any model config, `incremental_strategy` may be specified in `dbt_project.yml` or within a model file's `config()` block. @@ -185,17 +186,10 @@ insert overwrite table analytics.spark_incremental ### The `merge` strategy - - - - `dbt-spark==0.15.3`: Introduced `merge` incremental strategy - - - - **Usage notes:** The `merge` incremental strategy requires: -- `file_format: delta or hudi` +- `file_format: delta, iceberg or hudi` - Databricks Runtime 5.1 and above for delta file format -- Apache Spark for hudi file format +- Apache Spark for Iceberg or Hudi file format dbt will run an [atomic `merge` statement](https://docs.databricks.com/spark/latest/spark-sql/language-manual/merge-into.html) which looks nearly identical to the default merge behavior on Snowflake and BigQuery. If a `unique_key` is specified (recommended), dbt will update old records with values from new records that match on the key column. If a `unique_key` is not specified, dbt will forgo match criteria and simply insert all new records (similar to `append` strategy). @@ -213,7 +207,7 @@ dbt will run an [atomic `merge` statement](https://docs.databricks.com/spark/lat ```sql {{ config( materialized='incremental', - file_format='delta', # or 'hudi' + file_format='delta', # or 'iceberg' or 'hudi' unique_key='user_id', incremental_strategy='merge' ) }} @@ -279,7 +273,7 @@ merge into analytics.merge_incremental as DBT_INTERNAL_DEST ## Persisting model descriptions Relation-level docs persistence is supported in dbt v0.17.0. For more -information on configuring docs persistence, see [the docs](resource-configs/persist_docs). +information on configuring docs persistence, see [the docs](/reference/resource-configs/persist_docs). 
When the `persist_docs` option is configured appropriately, you'll be able to see model descriptions in the `Comment` field of `describe [table] extended` @@ -287,12 +281,6 @@ or `show table extended in [database] like '*'`. ## Always `schema`, never `database` - - - - `dbt-spark==0.17.0` ended use of `database` in all cases. - - - Apache Spark uses the terms "schema" and "database" interchangeably. dbt understands `database` to exist at a higher level than `schema`. As such, you should _never_ use or set `database` as a node config or in the target profile when running dbt-spark. @@ -303,8 +291,8 @@ use the `schema` config and `generate_schema_name` macro _only_. ## Default file format configurations To access advanced incremental strategies features, such as -[snapshots](snapshots) and the `merge` incremental strategy, you will want to -use the Delta or Hudi file format as the default file format when materializing models as tables. +[snapshots](/docs/build/snapshots) and the `merge` incremental strategy, you will want to +use the Delta, Iceberg or Hudi file format as the default file format when materializing models as tables. It's quite convenient to do this by setting a top-level configuration in your project file: @@ -313,13 +301,13 @@ project file: ```yml models: - +file_format: delta # or hudi + +file_format: delta # or iceberg or hudi seeds: - +file_format: delta # or hudi + +file_format: delta # or iceberg or hudi snapshots: - +file_format: delta # or hudi + +file_format: delta # or iceberg or hudi ``` diff --git a/website/docs/reference/resource-configs/sql_header.md b/website/docs/reference/resource-configs/sql_header.md index e56fd4c0f96..cfa1e751cfc 100644 --- a/website/docs/reference/resource-configs/sql_header.md +++ b/website/docs/reference/resource-configs/sql_header.md @@ -1,5 +1,6 @@ --- resource_types: [models] +description: "Sql_header - Read this in-depth guide to learn about configurations in dbt." 
datatype: "string" --- @@ -30,10 +31,10 @@ select ... ```yml -[config-version](config-version): 2 +[config-version](/reference/project-configs/config-version): 2 models: - [](resource-path): + [](/reference/resource-configs/resource-path): +sql_header: ``` @@ -72,7 +73,7 @@ select ... ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +sql_header: ``` @@ -90,7 +91,7 @@ An optional configuration to inject SQL above the `create table as` and `create `sql_header`s can be set using the config, or by `call`-ing the `set_sql_header` macro (example below). ## Comparison to pre-hooks -[Pre-hooks](pre-hook-post-hook) also provide an opportunity to execute SQL before model creation, as a _preceding_ query. In comparison, SQL in a `sql_header` is run in the same _query_ as the `create table|view as` statement. +[Pre-hooks](/reference/resource-configs/pre-hook-post-hook) also provide an opportunity to execute SQL before model creation, as a _preceding_ query. In comparison, SQL in a `sql_header` is run in the same _query_ as the `create table|view as` statement. As a result, this makes it more useful for [Snowflake session parameters](https://docs.snowflake.com/en/sql-reference/parameters.html) and [BigQuery Temporary UDFs](https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions#sql-udf-examples). diff --git a/website/docs/reference/resource-configs/store_failures.md b/website/docs/reference/resource-configs/store_failures.md index 9207e333546..6c71cdb9296 100644 --- a/website/docs/reference/resource-configs/store_failures.md +++ b/website/docs/reference/resource-configs/store_failures.md @@ -3,23 +3,20 @@ resource_types: [tests] datatype: boolean --- - - -* `v0.20.0`: Introduced `store_failures` config and functionality -* `v0.21.0`: Introduced `config` property for tests - - - -The configured test(s) will store their failures when `dbt test --store-failures` is invoked. 
+The configured test(s) will store their failures when `dbt test --store-failures` is invoked. If you set this configuration as `false` but [`store_failures_as`](/reference/resource-configs/store_failures_as) is configured, it will be overridden. ## Description Optionally set a test to always or never store its failures in the database. - If specified as `true` or `false`, the `store_failures` config will take precedence over the presence or absence of the `--store-failures` flag. - If the `store_failures` config is `none` or omitted, the resource will use the value of the `--store-failures` flag. +- When true, `store_failures` saves all the record(s) that failed the test only if [limit](/reference/resource-configs/limit) is not set or if there are fewer records than the limit. `store_failures` are saved in a new table with the name of the test. By default, `store_failures` uses a schema named `dbt_test__audit`, but you can configure the schema to a different value. This logic is encoded in the [`should_store_failures()`](https://github.com/dbt-labs/dbt-core/blob/98c015b7754779793e44e056905614296c6e4527/core/dbt/include/global_project/macros/materializations/helpers.sql#L77) macro. + + + ](resource-path): + [](/reference/resource-configs/resource-path): +strategy: timestamp +updated_at: column_name @@ -66,7 +67,7 @@ snapshots: ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +strategy: check +check_cols: [column_name] | all @@ -79,7 +80,7 @@ snapshots: ## Description -The snapshot strategy dbt should use to detect record changes. Read the guide to [snapshots](snapshots#detecting-row-changes) to understand the differences between the two. +The snapshot strategy dbt should use to detect record changes. Read the guide to [snapshots](/docs/build/snapshots#detecting-row-changes) to understand the differences between the two. ## Default This is a **required configuration**. There is no default value.
@@ -136,7 +137,7 @@ Behind the scenes, snapshot strategies are implemented as macros, named `snapsho It's possible to implement your own snapshot strategy by adding a macro with the same naming pattern to your project. For example, you might choose to create a strategy which records hard deletes, named `timestamp_with_deletes`. -1. Create a macro named `snapshot_timestamp_with_deletes_strategy`. Use the existing code a guide and adjust as needed. +1. Create a macro named `snapshot_timestamp_with_deletes_strategy`. Use the existing code as a guide and adjust as needed. 2. Use this strategy via the `strategy` configuration: diff --git a/website/docs/reference/resource-configs/tags.md b/website/docs/reference/resource-configs/tags.md index 48b0842a06d..f6c46f8a088 100644 --- a/website/docs/reference/resource-configs/tags.md +++ b/website/docs/reference/resource-configs/tags.md @@ -1,12 +1,9 @@ --- +sidebar_label: "tags" resource_types: all datatype: string | [string] --- - - - **v0.21.0** introduced the `config` property, thereby allowing you to configure resources in all `.yml` files - - ](resource-path): + [](/reference/resource-configs/resource-path): +tags: | [] snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +tags: | [] seeds: - [](resource-path): + [](/reference/resource-configs/resource-path): +tags: | [] ``` @@ -80,11 +77,11 @@ models: ## Definition Apply a tag (or list of tags) to a resource. 
-These tags can be used as part of the [resource selection syntax](node-selection/syntax), when running the following commands: +These tags can be used as part of the [resource selection syntax](/reference/node-selection/syntax), when running the following commands: - `dbt run --select tag:my_tag` - `dbt seed --select tag:my_tag` - `dbt snapshot --select tag:my_tag` -- `dbt test --select tag:my_tag` (indirectly runs all tests accociated with the models that are tagged) +- `dbt test --select tag:my_tag` (indirectly runs all tests associated with the models that are tagged) ## Examples ### Use tags to run parts of your project @@ -201,7 +198,7 @@ sources: tables: - name: table_name - tags: [] + tags: ['table_level'] columns: - name: column_name @@ -213,9 +210,10 @@ sources: -In the example above, the `unique` test would be selected by any of the three tags: +In the example above, the `unique` test would be selected by any of these four tags: ```bash $ dbt test --select tag:top_level +$ dbt test --select tag:table_level $ dbt test --select tag:column_level $ dbt test --select tag:test_level ``` diff --git a/website/docs/reference/resource-configs/target_database.md b/website/docs/reference/resource-configs/target_database.md index 6230c1ef6d9..5f65fa79bad 100644 --- a/website/docs/reference/resource-configs/target_database.md +++ b/website/docs/reference/resource-configs/target_database.md @@ -1,5 +1,6 @@ --- resource_types: [snapshots] +description: "Target_database - Read this in-depth guide to learn about configurations in dbt." datatype: string --- @@ -7,7 +8,7 @@ datatype: string ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +target_database: string ``` @@ -26,7 +27,7 @@ snapshots: ## Description -The database that dbt should build a [snapshot](snapshots) into. +The database that dbt should build a [snapshot](/docs/build/snapshots) into. 
Notes: - The specified database must already exist @@ -40,7 +41,7 @@ Runtime Error ## Default -By default, dbt will use the [target](target) database associated with your profile/connection. +By default, dbt will use the [target](/reference/dbt-jinja-functions/target) database associated with your profile/connection. ## Examples ### Build all snapshots in a database named `snapshots` @@ -56,7 +57,7 @@ snapshots: ### Use a target-aware database -Use the [`{{ target }}` variable](target) to change which database a snapshot table is built in. +Use the [`{{ target }}` variable](/reference/dbt-jinja-functions/target) to change which database a snapshot table is built in. Note: consider whether this use-case is right for you, as downstream `refs` will select from the `dev` version of a snapshot, which can make it hard to validate models that depend on snapshots. diff --git a/website/docs/reference/resource-configs/target_schema.md b/website/docs/reference/resource-configs/target_schema.md index 9d8eeb33b51..041f004e20c 100644 --- a/website/docs/reference/resource-configs/target_schema.md +++ b/website/docs/reference/resource-configs/target_schema.md @@ -1,5 +1,6 @@ --- resource_types: [snapshots] +description: "Target_schema - Read this in-depth guide to learn about configurations in dbt." datatype: string --- @@ -7,7 +8,7 @@ datatype: string ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +target_schema: string ``` @@ -26,7 +27,7 @@ snapshots: ## Description -The schema that dbt should build a [snapshot](snapshots) into. Snapshots build into the same `target_schema`, no matter who is running them. +The schema that dbt should build a [snapshot](/docs/build/snapshots) into. Snapshots build into the same `target_schema`, no matter who is running them. On **BigQuery**, this is analogous to a `dataset`. @@ -34,7 +35,7 @@ On **BigQuery**, this is analogous to a `dataset`. This is a **required** parameter, no default is provided. 
## FAQs - + ## Examples ### Build all snapshots in a schema named `snapshots` @@ -50,7 +51,7 @@ snapshots: ### Use a target-aware schema -Use the [`{{ target }}` variable](target) to change which schema a snapshot is built in. +Use the [`{{ target }}` variable](/reference/dbt-jinja-functions/target) to change which schema a snapshot is built in. Note: consider whether this use-case is right for you, as downstream `refs` will select from the `dev` version of a snapshot, which can make it hard to validate models that depend on snapshots (see above [FAQ](#faqs)) diff --git a/website/docs/reference/resource-configs/teradata-configs.md b/website/docs/reference/resource-configs/teradata-configs.md index 033dec1d94a..f0f4f1a6f3e 100644 --- a/website/docs/reference/resource-configs/teradata-configs.md +++ b/website/docs/reference/resource-configs/teradata-configs.md @@ -204,7 +204,7 @@ id: "teradata-configs" :::info Using seeds to load raw data -As explained in [dbt seeds documentation](https://docs.getdbt.com/docs/building-a-dbt-project/seeds), seeds should not be used to load raw data (for example, large CSV exports from a production database). +As explained in [dbt seeds documentation](/docs/build/seeds), seeds should not be used to load raw data (for example, large CSV exports from a production database). Since seeds are version controlled, they are best suited to files that contain business-specific logic, for example a list of country codes or user IDs of employees. 
diff --git a/website/docs/reference/resource-configs/trino-configs.md b/website/docs/reference/resource-configs/trino-configs.md new file mode 100644 index 00000000000..21df13feac4 --- /dev/null +++ b/website/docs/reference/resource-configs/trino-configs.md @@ -0,0 +1,371 @@ +--- +title: "Starburst/Trino configurations" +id: "trino-configs" +--- + +## Cluster requirements + +The designated cluster must have an attached catalog where objects such as tables and views can be created, renamed, altered, and dropped. Any user connecting to the cluster with dbt must also have these same permissions for the target catalog. + +## Session properties + +With a Starburst Enterprise, Starburst Galaxy, or Trino cluster, you can [set session properties](https://trino.io/docs/current/sql/set-session.html) to modify the current configuration for your user session. + +The standard way to define session properties is with the `session_properties` field of your `profiles.yml`. This ensures that all dbt connections use these settings by default. + +However, to temporarily adjust these session properties for a specific dbt model or group of models, you can use a [dbt hook](/reference/resource-configs/pre-hook-post-hook) to set session properties on a specific dbt model. For example: + +```sql +{{ + config( + pre_hook="set session query_max_run_time='10m'" + ) +}} +``` + +## Connector properties + +You can use Starburst/Trino table properties to configure how you want your data to be represented. + +For details on what's supported for each supported data source, refer to either the [Trino Connectors](https://trino.io/docs/current/connector.html) or [Starburst Catalog](https://docs.starburst.io/starburst-galaxy/catalogs/). + + + +### Hive catalogs + +A target catalog that uses the Hive connector and a metastore service (HMS) is typical when working with Starburst and dbt. The following settings are recommended for working with dbt.
The intent is to ensure that dbt can perform the frequently executed `DROP` and `RENAME` statements. + +```java +hive.metastore-cache-ttl=0s +hive.metastore-refresh-interval=5s +``` + +## File format configuration + +When using file-based connectors such as Hive, a user can customize aspects of the connector such as the format that is used as well as the type of materialization. + +The below configures the table to be materialized as a set of partitioned [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) files. + +```sql +{{ + config( + materialized='table', + properties= { + "format": "'PARQUET'", + "partitioning": "ARRAY['bucket(id, 2)']", + } + ) +}} +``` + +## Seeds and prepared statements + +The [dbt seed](/docs/build/seeds) command makes use of prepared statements in [Starburst](https://docs.starburst.io/latest/sql/prepare.html)/[Trino](https://trino.io/docs/current/sql/prepare.html). + +Prepared statements are templated SQL statements that you can execute repeatedly with high efficiency. The values are sent in a separate field rather than hard coded in the SQL string itself. This is often how application frontends structure their record `INSERT` statements in the OLTP database backend. Because of this, it's common for prepared statements to have as many placeholder variables (parameters) as there are columns in the destination table. + +Most seed files have more than one row, and often thousands of rows. This makes the size of the client request as large as there are parameters. + +### Header line length limit in Python HTTP client + +You might run into an error message about header line limit if your prepared statements have too many parameters. This is because the header line limit in Python's HTTP client is `65536` bytes. + +You can avoid this upper limit by converting the large prepared statement into smaller statements.
dbt already does this by batching an entire seed file into groups of rows — one group for a number of rows in the CSV. + +Let's say you have a seed file with 20 columns, 600 rows, and 12,000 parameters. Instead of creating a single prepared statement for this, you can have dbt create four prepared `INSERT` statements with 150 rows and 3,000 parameters. + +There's a drawback to grouping your table rows. When there are many columns (parameters) in a seed file, the batch size needs to be very small. + +For the `dbt-trino` adapter, the macro for batch size is `trino__get_batch_size()` and its default value is `1000`. To change this default behavior, you can add this macro to your dbt project: + + + +```sql +{% macro trino__get_batch_size() %} + {{ return(10000) }} -- Adjust this number as you see fit +{% endmacro %} +``` + + + +Another way to avoid the header line length limit is to set `prepared_statements_enabled` to `true` in your dbt profile; however, this is considered legacy behavior and can be removed in a future release. + +## Materializations +### Table + +The `dbt-trino` adapter supports these modes in `table` materialization, which you can configure with `on_table_exists`: + +- `rename` — Creates an intermediate table, renames the target table to the backup one, and renames the intermediate table to the target one. +- `drop` — Drops and re-creates a table. This overcomes the table rename limitation in AWS Glue. + +The recommended `table` materialization uses `on_table_exists = 'rename'` and is also the default. 
You can change this default configuration by editing _one_ of these files: +- the SQL file for your model +- the `dbt_project.yml` configuration file + +The following examples configure `table` materialization to be `drop`: + + + +```sql +{{ + config( + materialized = 'table', + on_table_exists = 'drop' + ) +}} +``` + + + + + + +```yaml +models: + path: + materialized: table + +on_table_exists: drop +``` + + +If you use `table` materialization and `on_table_exists = 'rename'` with AWS Glue, you might encounter this error message. You can overcome the table rename limitation by using `drop`: + +```sh +TrinoUserError(type=USER_ERROR, name=NOT_SUPPORTED, message="Table rename is not yet supported by Glue service") +``` + +### View + +The `dbt-trino` adapter supports these security modes in `view` materialization, which you can configure with `view_security`: +- `definer` +- `invoker` + +For more details about security modes in views, see [Security](https://trino.io/docs/current/sql/create-view.html#security) in the Trino docs. + +By default, `view` materialization uses `view_security = 'definer'`. You can change this default configuration by editing _one_ of these files: +- the SQL file for your model +- the `dbt_project.yml` configuration file + +For example, these configure the security mode to `invoker`: + + + +```sql +{{ + config( + materialized = 'view', + view_security = 'invoker' + ) +}} +``` + + + + + +```yaml +models: + path: + materialized: view + +view_security: invoker +``` + + + +### Incremental + +Using an incremental model limits the amount of data that needs to be transformed, which greatly reduces the runtime of your transformations. This improves performance and reduces compute costs.
+ +```sql +{{ + config( + materialized = 'incremental', + unique_key='', + incremental_strategy='',) +}} +select * from {{ ref('events') }} +{% if is_incremental() %} + where event_ts > (select max(event_ts) from {{ this }}) +{% endif %} +``` + +Use the `+on_schema_change` property to define how dbt-trino should handle column changes. For more details about this property, see [column changes](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models#what-if-the-columns-of-my-incremental-model-change). + +If your connector doesn't support views, set the `+views_enabled` property to `false`. + +#### append strategy + +The default incremental strategy is `append`. `append` only adds new records based on the condition specified in the `is_incremental()` conditional block. + +```sql +{{ + config( + materialized = 'incremental') +}} +select * from {{ ref('events') }} +{% if is_incremental() %} + where event_ts > (select max(event_ts) from {{ this }}) +{% endif %} +``` + +#### delete+insert strategy + +With the `delete+insert` incremental strategy, you can instruct dbt to use a two-step incremental approach. First, it deletes the records detected through the configured `is_incremental()` block, then re-inserts them. + +```sql +{{ + config( + materialized = 'incremental', + unique_key='user_id', + incremental_strategy='delete+insert', + ) +}} +select * from {{ ref('users') }} +{% if is_incremental() %} + where updated_ts > (select max(updated_ts) from {{ this }}) +{% endif %} +``` + +#### merge strategy + +With the `merge` incremental strategy, dbt-trino constructs a [Trino MERGE statement](https://trino.io/docs/current/sql/merge.html) to `insert` new records and `update` existing records, based on the `unique_key` property. + +If `unique_key` is not unique, you can use the `delete+insert` strategy instead. 
+ +```sql +{{ + config( + materialized = 'incremental', + unique_key='user_id', + incremental_strategy='merge', + ) +}} +select * from {{ ref('users') }} +{% if is_incremental() %} + where updated_ts > (select max(updated_ts) from {{ this }}) +{% endif %} +``` + +Be aware that there are some Trino connectors that don't support `MERGE` or have limited support. + +#### Incremental overwrite on Hive models + +If there's a [Hive connector](https://trino.io/docs/current/connector/hive.html) accessing your target incremental model, you can simulate an `INSERT OVERWRITE` statement by using the `insert-existing-partitions-behavior` setting on the Hive connector configuration in Trino: + +```ini +.insert-existing-partitions-behavior=OVERWRITE +``` + +Below is an example Hive configuration that sets the `OVERWRITE` functionality for a Hive connector called `minio`: + +```yaml +trino-incremental-hive: + target: dev + outputs: + dev: + type: trino + method: none + user: admin + password: + catalog: minio + schema: tiny + host: localhost + port: 8080 + http_scheme: http + session_properties: + minio.insert_existing_partitions_behavior: OVERWRITE + threads: 1 +``` + +`dbt-trino` overwrites existing partitions in the target model that match the staged data. It appends the remaining partitions to the target model. This functionality works on incremental models that use partitioning. For example: + +```sql +{{ + config( + materialized = 'incremental', + properties={ + "format": "'PARQUET'", + "partitioned_by": "ARRAY['day']", + } + ) +}} +``` + +### Materialized view + +The `dbt-trino` adapter supports [materialized views](https://trino.io/docs/current/sql/create-materialized-view.html) and refreshes them for every subsequent `dbt run` that you execute. For more information, see [REFRESH MATERIALIZED VIEW](https://trino.io/docs/current/sql/refresh-materialized-view.html) in the Trino docs. 
+ +You can also define custom properties for the materialized view through the `properties` config. + +This materialization supports the [full_refresh](https://docs.getdbt.com/reference/resource-configs/full_refresh) config and flag. +Whenever you want to rebuild your materialized view (for example, when changing underlying SQL query) run `dbt run --full-refresh`. + +You can create a materialized view by editing _one_ of these files: +- the SQL file for your model +- the `dbt_project.yml` configuration file + +The following examples create a materialized view in Parquet format: + + + +```sql +{{ + config( + materialized = 'materialized_view', + properties = { + 'format': "'PARQUET'" + }, + ) +}} +``` + + + + + + +```yaml +models: + path: + materialized: materialized_view + properties: + format: "'PARQUET'" +``` + + +## Snapshots + +[Snapshots in dbt](/docs/build/snapshots) depend on the `current_timestamp` macro, which returns a timestamp with millisecond precision (3 digits) by default. There are some connectors for Trino that don't support this timestamp precision (`TIMESTAMP(3) WITH TIME ZONE`), like Iceberg. + +To change timestamp precision, you can define your own [macro](/docs/build/jinja-macros). For example, this defines a new `trino__current_timestamp()` macro with microsecond precision (6 digits): + + + +```sql +{% macro trino__current_timestamp() %} + current_timestamp(6) +{% endmacro %} +``` + + +## Grants + +Use [grants](/reference/resource-configs/grants) to manage access to the datasets you're producing with dbt. You can use grants with [Starburst Enterprise](https://docs.starburst.io/latest/security/biac-overview.html), [Starburst Galaxy](https://docs.starburst.io/starburst-galaxy/security/access-control.html), and Hive ([sql-standard](https://trino.io/docs/current/connector/hive-security.html)). + + +To implement access permissions, define grants as resource configs on each model, seed, and snapshot. 
Define the default grants that apply to the entire project in your `dbt_project.yml` and define model-specific grants within each model's SQL or YAML file. + + +```yaml +models: + - name: NAME_OF_YOUR_MODEL + config: + grants: + select: ['reporter', 'bi'] +``` + + +## Model contracts + +The `dbt-trino` adapter supports [model contracts](/docs/collaborate/govern/model-contracts). Currently, only [constraints](/reference/resource-properties/constraints) with `type` as `not_null` are supported. +Before using `not_null` constraints in your model, make sure the underlying connector supports `not null`, to avoid running into errors. diff --git a/website/docs/reference/resource-configs/unique_key.md b/website/docs/reference/resource-configs/unique_key.md index 35ba22b9b44..4e2409bb618 100644 --- a/website/docs/reference/resource-configs/unique_key.md +++ b/website/docs/reference/resource-configs/unique_key.md @@ -1,5 +1,6 @@ --- resource_types: [snapshots] +description: "Unique_key - Read this in-depth guide to learn about configurations in dbt." datatype: column_name_or_expression --- @@ -18,7 +19,7 @@ datatype: column_name_or_expression ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +unique_key: column_name_or_expression ``` @@ -59,7 +60,7 @@ You can also write this in yaml. This might be a good idea if multiple snapshots ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +unique_key: id ``` diff --git a/website/docs/reference/resource-configs/updated_at.md b/website/docs/reference/resource-configs/updated_at.md index 5731fd2ef78..896405bf063 100644 --- a/website/docs/reference/resource-configs/updated_at.md +++ b/website/docs/reference/resource-configs/updated_at.md @@ -1,5 +1,6 @@ --- resource_types: [snapshots] +description: "Updated_at - Read this in-depth guide to learn about configurations in dbt." 
datatype: column_name --- @@ -18,7 +19,7 @@ datatype: column_name ```yml snapshots: - [](resource-path): + [](/reference/resource-configs/resource-path): +strategy: timestamp +updated_at: column_name @@ -29,7 +30,7 @@ snapshots: ## Description A column within the results of your snapshot query that represents when the record row was last updated. -This parameter is **required if using the `timestamp` [strategy](strategy)**. +This parameter is **required if using the `timestamp` [strategy](/reference/resource-configs/strategy)**. ## Default diff --git a/website/docs/reference/resource-configs/upsolver-configs.md b/website/docs/reference/resource-configs/upsolver-configs.md new file mode 100644 index 00000000000..b917ee2cc58 --- /dev/null +++ b/website/docs/reference/resource-configs/upsolver-configs.md @@ -0,0 +1,464 @@ +--- +title: "Upsolver configurations" +id: "upsolver-configs" +description: "Upsolver Configurations - Read this in-depth guide to learn about configurations in dbt." +--- + +## Supported Upsolver SQLake functionality + +| COMMAND | STATE | MATERIALIZED | +| ------ | ------ | ------ | +| SQL compute cluster| not supported | - | +| SQL connections| supported | connection | +| SQL copy job | supported | incremental | +| SQL merge job | supported | incremental | +| SQL insert job | supported | incremental | +| SQL materialized views | supported | materializedview | +| Expectations | supported | incremental | + +## Configs materialization + +| Config | Required | Materialization | Description | Example | +| ------ | --------- | --------------- | ---------- | ------- | +| connection_type | Yes | connection | Connection identifier: S3/GLUE_CATALOG/KINESIS | connection_type='S3' | +| connection_options | Yes | connection | Dictionary of options supported by selected connection | connection_options={ 'aws_role': 'aws_role', 'external_id': 'SAMPLES', 'read_only': True } | +| incremental_strategy | No | incremental | Define one of incremental strategies: 
merge/copy/insert. Default: copy | incremental_strategy='merge' | +| source | No | incremental | Define source to copy from: S3/KAFKA/KINESIS | source = 'S3' | +| target_type | No | incremental | Define target type REDSHIFT/ELASTICSEARCH/S3/SNOWFLAKE/POSTGRES. Default None for Data lake | target_type='Snowflake' | +| target_prefix | No | incremental | Define PREFIX for ELASTICSEARCH target type | target_prefix = 'orders' | +| target_location | No | incremental | Define LOCATION for S3 target type | target_location = 's3://your-bucket-name/path/to/folder/' | +| schema | Yes/No | incremental | Define target schema. Required if target_type, no table created in a metastore connection | schema = 'target_schema' | +| database | Yes/No | incremental | Define target connection. Required if target_type, no table created in a metastore connection | database = 'target_connection' | +| alias | Yes/No | incremental | Define target table. Required if target_type, no table created in a metastore connection | alias = 'target_table' | +| delete_condition | No | incremental | Records that match the ON condition and a delete condition can be deleted | delete_condition='nettotal > 1000' | +| partition_by | No | incremental | List of dictionaries to define partition_by for target metastore table | partition_by=[{'field':'$field_name'}] | +| primary_key | No | incremental | List of dictionaries to define primary_key for target metastore table | primary_key=[{'field':'customer_email', 'type':'string'}] | +| map_columns_by_name | No | incremental | Maps columns from the SELECT statement to the table. Boolean. Default: False | map_columns_by_name=True | +| sync | No | incremental/materializedview | Boolean option to define if job is synchronized or non-synchronized.
Default: False | sync=True | +| options | No | incremental/materializedview | Dictionary of job options | options={ 'START_FROM': 'BEGINNING', 'ADD_MISSING_COLUMNS': True } | + +## SQL connection + +Connections are used to provide Upsolver with the proper credentials to bring your data into SQLake as well as to write out your transformed data to various services. More details on ["Upsolver SQL connections"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-connections) +As a dbt model connection is a model with materialized='connection' + +```sql +{{ config( + materialized='connection', + connection_type={ 'S3' | 'GLUE_CATALOG' | 'KINESIS' | 'KAFKA'| 'SNOWFLAKE' }, + connection_options={} + ) +}} +``` + +Running this model will compile CREATE CONNECTION(or ALTER CONNECTION if exists) SQL and send it to Upsolver engine. Name of the connection will be name of the model. + +## SQL copy job + +A COPY FROM job allows you to copy your data from a given source into a table created in a metastore connection. This table then serves as your staging table and can be used with SQLake transformation jobs to write to various target locations. More details on ["Upsolver SQL copy-from"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/copy-from) + +As a dbt model copy job is model with materialized='incremental' + +```sql +{{ config( materialized='incremental', + sync=True|False, + source = 'S3'| 'KAFKA' | ... , + options={ + 'option_name': 'option_value' + }, + partition_by=[{}] + ) +}} +SELECT * FROM {{ ref() }} +``` + +Running this model will compile CREATE TABLE SQL for target type Data lake (or ALTER TABLE if exists) and CREATE COPY JOB(or ALTER COPY JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. 
Name of the job will be name of the model plus '_job' + +## SQL insert job + +An INSERT job defines a query that pulls in a set of data based on the given SELECT statement and inserts it into the designated target. This query is then run periodically based on the RUN_INTERVAL defined within the job. More details on ["Upsolver SQL insert"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/insert). + +As a dbt model insert job is model with materialized='incremental' and incremental_strategy='insert' + +```sql +{{ config( materialized='incremental', + sync=True|False, + map_columns_by_name=True|False, + incremental_strategy='insert', + options={ + 'option_name': 'option_value' + }, + primary_key=[{}] + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +HAVING COUNT(DISTINCT orderid::string) ... +``` + +Running this model will compile CREATE TABLE SQL for target type Data lake(or ALTER TABLE if exists) and CREATE INSERT JOB(or ALTER INSERT JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. Name of the job will be name of the model plus '_job' + +## SQL merge job + +A MERGE job defines a query that pulls in a set of data based on the given SELECT statement and inserts into, replaces, or deletes the data from the designated target based on the job definition. This query is then run periodically based on the RUN_INTERVAL defined within the job. More details on ["Upsolver SQL merge"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/merge). + +As a dbt model merge job is model with materialized='incremental' and incremental_strategy='merge' + +```sql +{{ config( materialized='incremental', + sync=True|False, + map_columns_by_name=True|False, + incremental_strategy='merge', + options={ + 'option_name': 'option_value' + }, + primary_key=[{}] + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +HAVING COUNT ... 
+``` + +Running this model will compile CREATE TABLE SQL for target type Data lake(or ALTER TABLE if exists) and CREATE MERGE JOB(or ALTER MERGE JOB if exists) SQL and send it to Upsolver engine. Name of the table will be name of the model. Name of the job will be name of the model plus '_job' + +## SQL materialized views + +When transforming your data, you may find that you need data from multiple source tables in order to achieve your desired result. +In such a case, you can create a materialized view from one SQLake table in order to join it with your other table (which in this case is considered the main table). More details on ["Upsolver SQL materialized views"](https://docs.upsolver.com/sqlake/sql-command-reference/sql-jobs/create-job/sql-transformation-jobs/sql-materialized-views). + +As a dbt model materialized views is model with materialized='materializedview'. + +```sql +{{ config( materialized='materializedview', + sync=True|False, + options={'option_name': 'option_value'} + ) +}} +SELECT ... +FROM {{ ref() }} +WHERE ... +GROUP BY ... +``` + +Running this model will compile CREATE MATERIALIZED VIEW SQL(or ALTER MATERIALIZED VIEW if exists) and send it to Upsolver engine. Name of the materializedview will be name of the model. + +## Expectations/constraints + +Data quality conditions can be added to your job to drop a row or trigger a warning when a column violates a predefined condition. 
+ +```sql +WITH EXPECTATION EXPECT +ON VIOLATION WARN +``` + +Expectations can be implemented with dbt constraints +Supported constraints: check and not_null + +```yaml +models: + - name: + # required + config: + contract: + enforced: true + # model-level constraints + constraints: + - type: check + columns: ['', ''] + expression: "column1 <= column2" + name: + - type: not_null + columns: ['column1', 'column2'] + name: + + columns: + - name: + data_type: string + + # column-level constraints + constraints: + - type: not_null + - type: check + expression: "REGEXP_LIKE(, '^[0-9]{4}[a-z]{5}$')" + name: +``` + +## Projects examples + +> projects examples link: [github.com/dbt-upsolver/examples/](https://github.com/Upsolver/dbt-upsolver/tree/main/examples) + +## Connection options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| aws_role | s3 | True | True | 'aws_role': `''` | +| external_id | s3 | True | True | 'external_id': `''` | +| aws_access_key_id | s3 | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | s3 | True | True | 'aws_secret_access_key_id': `''` | +| path_display_filter | s3 | True | True | 'path_display_filter': `''` | +| path_display_filters | s3 | True | True | 'path_display_filters': (`''`, ...) | +| read_only | s3 | True | True | 'read_only': True/False | +| encryption_kms_key | s3 | True | True | 'encryption_kms_key': `''` | +| encryption_customer_managed_key | s3 | True | True | 'encryption_customer_kms_key': `''` | +| comment | s3 | True | True | 'comment': `''` | +| host | kafka | False | False | 'host': `''` | +| hosts | kafka | False | False | 'hosts': (`''`, ...) 
| +| consumer_properties | kafka | True | True | 'consumer_properties': `''` | +| version | kafka | False | True | 'version': `''` | +| require_static_ip | kafka | True | True | 'require_static_ip': True/False | +| ssl | kafka | True | True | 'ssl': True/False | +| topic_display_filter | kafka | True | True | 'topic_display_filter': `''` | +| topic_display_filters | kafka | True | True | 'topic_display_filters': (`''`, ...) | +| comment | kafka | True | True | 'comment': `''` | +| aws_role | glue_catalog | True | True | 'aws_role': `''` | +| external_id | glue_catalog | True | True | 'external_id': `''` | +| aws_access_key_id | glue_catalog | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | glue_catalog | True | True | 'aws_secret_access_key': `''` | +| default_storage_connection | glue_catalog | False | False | 'default_storage_connection': `''` | +| default_storage_location | glue_catalog | False | False | 'default_storage_location': `''` | +| region | glue_catalog | False | True | 'region': `''` | +| database_display_filter | glue_catalog | True | True | 'database_display_filter': `''` | +| database_display_filters | glue_catalog | True | True | 'database_display_filters': (`''`, ...) | +| comment | glue_catalog | True | True | 'comment': `''` | +| aws_role | kinesis | True | True | 'aws_role': `''` | +| external_id | kinesis | True | True | 'external_id': `''` | +| aws_access_key_id | kinesis | True | True | 'aws_access_key_id': `''` | +| aws_secret_access_key | kinesis | True | True | 'aws_secret_access_key': `''` | +| region | kinesis | False | False | 'region': `''` | +| read_only | kinesis | False | True | 'read_only': True/False | +| max_writers | kinesis | True | True | 'max_writers': `` | +| stream_display_filter | kinesis | True | True | 'stream_display_filter': `''` | +| stream_display_filters | kinesis | True | True | 'stream_display_filters': (`''`, ...) 
| +| comment | kinesis | True | True | 'comment': `''` | +| connection_string | snowflake | True | False | 'connection_string': `''` | +| user_name | snowflake | True | False | 'user_name': `''` | +| password | snowflake | True | False | 'password': `''` | +| max_concurrent_connections | snowflake | True | True | 'max_concurrent_connections': `` | +| comment | snowflake | True | True | 'comment': `''` | +| connection_string | redshift | True | False | 'connection_string': `''` | +| user_name | redshift | True | False | 'user_name': `''` | +| password | redshift | True | False | 'password': `''` | +| max_concurrent_connections | redshift | True | True | 'max_concurrent_connections': `` | +| comment | redshift | True | True | 'comment': `''` | +| connection_string | mysql | True | False | 'connection_string': `''` | +| user_name | mysql | True | False | 'user_name': `''` | +| password | mysql | True | False | 'password': `''` | +| comment | mysql | True | True | 'comment': `''` | +| connection_string | postgres | True | False | 'connection_string': `''` | +| user_name | postgres | True | False | 'user_name': `''` | +| password | postgres | True | False | 'password': `''` | +| comment | postgres | True | True | 'comment': `''` | +| connection_string | elasticsearch | True | False | 'connection_string': `''` | +| user_name | elasticsearch | True | False | 'user_name': `''` | +| password | elasticsearch | True | False | 'password': `''` | +| comment | elasticsearch | True | True | 'comment': `''` | +| connection_string | mongodb | True | False | 'connection_string': `''` | +| user_name | mongodb | True | False | 'user_name': `''` | +| password | mongodb | True | False | 'password': `''` | +| timeout | mongodb | True | True | 'timeout': "INTERVAL 'N' SECONDS" | +| comment | mongodb | True | True | 'comment': `''` | +| connection_string | mssql | True | False | 'connection_string': `''` | +| user_name | mssql | True | False | 'user_name': `''` | +| password | mssql | True 
| False | 'password': `''` | +| comment | mssql | True | True | 'comment': `''` | + +## Target options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| globally_unique_keys | datalake | False | True | 'globally_unique_keys': True/False | +| storage_connection | datalake | False | True | 'storage_connection': `''` | +| storage_location | datalake | False | True | 'storage_location': `''` | +| compute_cluster | datalake | True | True | 'compute_cluster': `''` | +| compression | datalake | True | True | 'compression': 'SNAPPY/GZIP' | +| compaction_processes | datalake | True | True | 'compaction_processes': `` | +| disable_compaction | datalake | True | True | 'disable_compaction': True/False | +| retention_date_partition | datalake | False | True | 'retention_date_partition': `''` | +| table_data_retention | datalake | True | True | 'table_data_retention': `''` | +| column_data_retention | datalake | True | True | 'column_data_retention': ({'COLUMN' : `''`,'DURATION': `''`}) | +| comment | datalake | True | True | 'comment': `''` | +| storage_connection | materialized_view | False | True | 'storage_connection': `''` | +| storage_location | materialized_view | False | True | 'storage_location': `''` | +| max_time_travel_duration | materialized_view | True | True | 'max_time_travel_duration': `''` | +| compute_cluster | materialized_view | True | True | 'compute_cluster': `''` | +| column_transformations | snowflake | False | True | 'column_transformations': {`''` : `''` , ...} | +| deduplicate_with | snowflake | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| exclude_columns | snowflake | False | True | 'exclude_columns': (`''`, ...) 
| +| create_table_if_missing | snowflake | False | True | 'create_table_if_missing': True/False | +| run_interval | snowflake | False | True | 'run_interval': `''` | + +## Transformation options + +| Option | Storage | Editable | Optional | Config Syntax | +| -------| --------- | -------- | -------- | ------------- | +| run_interval | s3 | False | True | 'run_interval': `''` | +| start_from | s3 | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | s3 | True | True | 'end_at': `'/NOW'` | +| compute_cluster | s3 | True | True | 'compute_cluster': `''` | +| comment | s3 | True | True | 'comment': `''` | +| skip_validations | s3 | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | s3 | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | s3 | True | True | 'aggregation_parallelism': `` | +| run_parallelism | s3 | True | True | 'run_parallelism': `` | +| file_format | s3 | False | False | 'file_format': '(type = ``)' | +| compression | s3 | False | True | 'compression': 'SNAPPY/GZIP ...' | +| date_pattern | s3 | False | True | 'date_pattern': `''` | +| output_offset | s3 | False | True | 'output_offset': `''` | +| run_interval | elasticsearch | False | True | 'run_interval': `''` | +| routing_field_name | elasticsearch | True | True | 'routing_field_name': `''` | +| start_from | elasticsearch | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | elasticsearch | True | True | 'end_at': `'/NOW'` | +| compute_cluster | elasticsearch | True | True | 'compute_cluster': `''` | +| skip_validations | elasticsearch | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) 
| +| skip_all_validations | elasticsearch | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | elasticsearch | True | True | 'aggregation_parallelism': `` | +| run_parallelism | elasticsearch | True | True | 'run_parallelism': `` | +| bulk_max_size_bytes | elasticsearch | True | True | 'bulk_max_size_bytes': `` | +| index_partition_size | elasticsearch | True | True | 'index_partition_size': 'HOURLY/DAILY ...' | +| comment | elasticsearch | True | True | 'comment': `''` | +| custom_insert_expressions | snowflake | True | True | 'custom_insert_expressions': {'INSERT_TIME' : 'CURRENT_TIMESTAMP()','MY_VALUE': `''`} | +| custom_update_expressions | snowflake | True | True | 'custom_update_expressions': {'UPDATE_TIME' : 'CURRENT_TIMESTAMP()','MY_VALUE': `''`} | +| keep_existing_values_when_null | snowflake | True | True | 'keep_existing_values_when_null': True/False | +| add_missing_columns | snowflake | False | True | 'add_missing_columns': True/False | +| run_interval | snowflake | False | True | 'run_interval': `''` | +| commit_interval | snowflake | True | True | 'commit_interval': `''` | +| start_from | snowflake | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | snowflake | True | True | 'end_at': `'/NOW'` | +| compute_cluster | snowflake | True | True | 'compute_cluster': `''` | +| skip_validations | snowflake | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) 
| +| skip_all_validations | snowflake | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | snowflake | True | True | 'aggregation_parallelism': `` | +| run_parallelism | snowflake | True | True | 'run_parallelism': `` | +| comment | snowflake | True | True | 'comment': `''` | +| add_missing_columns | datalake | False | True | 'add_missing_columns': True/False | +| run_interval | datalake | False | True | 'run_interval': `''` | +| start_from | datalake | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | datalake | True | True | 'end_at': `'/NOW'` | +| compute_cluster | datalake | True | True | 'compute_cluster': `''` | +| skip_validations | datalake | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | datalake | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | datalake | True | True | 'aggregation_parallelism': `` | +| run_parallelism | datalake | True | True | 'run_parallelism': `` | +| comment | datalake | True | True | 'comment': `''` | +| run_interval | redshift | False | True | 'run_interval': `''` | +| start_from | redshift | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | redshift | True | True | 'end_at': `'/NOW'` | +| compute_cluster | redshift | True | True | 'compute_cluster': `''` | +| skip_validations | redshift | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) 
| +| skip_all_validations | redshift | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | redshift | True | True | 'aggregation_parallelism': `` | +| run_parallelism | redshift | True | True | 'run_parallelism': `` | +| skip_failed_files | redshift | False | True | 'skip_failed_files': True/False | +| fail_on_write_error | redshift | False | True | 'fail_on_write_error': True/False | +| comment | redshift | True | True | 'comment': `''` | +| run_interval | postgres | False | True | 'run_interval': `''` | +| start_from | postgres | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | postgres | True | True | 'end_at': `'/NOW'` | +| compute_cluster | postgres | True | True | 'compute_cluster': `''` | +| skip_validations | postgres | False | True | 'skip_validations': ('ALLOW_CARTESIAN_PRODUCT', ...) | +| skip_all_validations | postgres | False | True | 'skip_all_validations': True/False | +| aggregation_parallelism | postgres | True | True | 'aggregation_parallelism': `` | +| run_parallelism | postgres | True | True | 'run_parallelism': `` | +| comment | postgres | True | True | 'comment': `''` | + +## Copy options + +| Option | Storage | Category | Editable | Optional | Config Syntax | +| -------| ---------- | -------- | -------- | -------- | ------------- | +| topic | kafka | source_options | False | False | 'topic': `''` | +| exclude_columns | kafka | job_options | False | True | 'exclude_columns': (`''`, ...) 
| +| deduplicate_with | kafka | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| consumer_properties | kafka | job_options | True | True | 'consumer_properties': `''` | +| reader_shards | kafka | job_options | True | True | 'reader_shards': `` | +| store_raw_data | kafka | job_options | False | True | 'store_raw_data': True/False | +| start_from | kafka | job_options | False | True | 'start_from': 'BEGINNING/NOW' | +| end_at | kafka | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | kafka | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | kafka | job_options | True | True | 'run_parallelism': `` | +| content_type | kafka | job_options | True | True | 'content_type': 'AUTO/CSV/...' | +| compression | kafka | job_options | False | True | 'compression': 'AUTO/GZIP/...' | +| column_transformations | kafka | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| commit_interval | kafka | job_options | True | True | 'commit_interval': `''` | +| skip_validations | kafka | job_options | False | True | 'skip_validations': ('MISSING_TOPIC') | +| skip_all_validations | kafka | job_options | False | True | 'skip_all_validations': True/False | +| comment | kafka | job_options | True | True | 'comment': `''` | +| table_include_list | mysql | source_options | True | True | 'table_include_list': (`''`, ...) | +| column_exclude_list | mysql | source_options | True | True | 'column_exclude_list': (`''`, ...) | +| exclude_columns | mysql | job_options | False | True | 'exclude_columns': (`''`, ...) 
| +| column_transformations | mysql | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mysql | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mysql | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mysql | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mysql | job_options | True | True | 'snapshot_parallelism': `` | +| ddl_filters | mysql | job_options | False | True | 'ddl_filters': (`''`, ...) | +| comment | mysql | job_options | True | True | 'comment': `''` | +| table_include_list | postgres | source_options | False | False | 'table_include_list': (`''`, ...) | +| column_exclude_list | postgres | source_options | False | True | 'column_exclude_list': (`''`, ...) | +| heartbeat_table | postgres | job_options | False | True | 'heartbeat_table': `''` | +| skip_snapshots | postgres | job_options | False | True | 'skip_snapshots': True/False | +| publication_name | postgres | job_options | False | False | 'publication_name': `''` | +| end_at | postgres | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | postgres | job_options | True | True | 'compute_cluster': `''` | +| comment | postgres | job_options | True | True | 'comment': `''` | +| parse_json_columns | postgres | job_options | False | False | 'parse_json_columns': True/False | +| column_transformations | postgres | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| snapshot_parallelism | postgres | job_options | True | True | 'snapshot_parallelism': `` | +| exclude_columns | postgres | job_options | False | True | 'exclude_columns': (`''`, ...) 
| +| location | s3 | source_options | False | False | 'location': `''` | +| date_pattern | s3 | job_options | False | True | 'date_pattern': `''` | +| file_pattern | s3 | job_options | False | True | 'file_pattern': `''` | +| initial_load_pattern | s3 | job_options | False | True | 'initial_load_pattern': `''` | +| initial_load_prefix | s3 | job_options | False | True | 'initial_load_prefix': `''` | +| delete_files_after_load | s3 | job_options | False | True | 'delete_files_after_load': True/False | +| deduplicate_with | s3 | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| end_at | s3 | job_options | True | True | 'end_at': `'/NOW'` | +| start_from | s3 | job_options | False | True | 'start_from': `'/NOW/BEGINNING'` | +| compute_cluster | s3 | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | s3 | job_options | True | True | 'run_parallelism': `` | +| content_type | s3 | job_options | True | True | 'content_type': 'AUTO/CSV...' | +| compression | s3 | job_options | False | True | 'compression': 'AUTO/GZIP...' | +| comment | s3 | job_options | True | True | 'comment': `''` | +| column_transformations | s3 | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| commit_interval | s3 | job_options | True | True | 'commit_interval': `''` | +| skip_validations | s3 | job_options | False | True | 'skip_validations': ('EMPTY_PATH') | +| skip_all_validations | s3 | job_options | False | True | 'skip_all_validations': True/False | +| exclude_columns | s3 | job_options | False | True | 'exclude_columns': (`''`, ...) 
| +| stream | kinesis | source_options | False | False | 'stream': `''` | +| reader_shards | kinesis | job_options | True | True | 'reader_shards': `` | +| store_raw_data | kinesis | job_options | False | True | 'store_raw_data': True/False | +| start_from | kinesis | job_options | False | True | 'start_from': `'/NOW/BEGINNING'` | +| end_at | kinesis | job_options | False | True | 'end_at': `'/NOW'` | +| compute_cluster | kinesis | job_options | True | True | 'compute_cluster': `''` | +| run_parallelism | kinesis | job_options | False | True | 'run_parallelism': `` | +| content_type | kinesis | job_options | True | True | 'content_type': 'AUTO/CSV...' | +| compression | kinesis | job_options | False | True | 'compression': 'AUTO/GZIP...' | +| comment | kinesis | job_options | True | True | 'comment': `''` | +| column_transformations | kinesis | job_options | True | True | 'column_transformations': {`''` : `''` , ...} | +| deduplicate_with | kinesis | job_options | False | True | 'deduplicate_with': {'COLUMNS' : ['col1', 'col2'],'WINDOW': 'N HOURS'} | +| commit_interval | kinesis | job_options | True | True | 'commit_interval': `''` | +| skip_validations | kinesis | job_options | False | True | 'skip_validations': ('MISSING_STREAM') | +| skip_all_validations | kinesis | job_options | False | True | 'skip_all_validations': True/False | +| exclude_columns | kinesis | job_options | False | True | 'exclude_columns': (`''`, ...) | +| table_include_list | mssql | source_options | True | True | 'table_include_list': (`''`, ...) | +| column_exclude_list | mssql | source_options | True | True | 'column_exclude_list': (`''`, ...) | +| exclude_columns | mssql | job_options | False | True | 'exclude_columns': (`''`, ...) 
| +| column_transformations | mssql | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mssql | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mssql | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mssql | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mssql | job_options | True | True | 'snapshot_parallelism': `` | +| parse_json_columns | mssql | job_options | False | False | 'parse_json_columns': True/False | +| comment | mssql | job_options | True | True | 'comment': `''` | +| collection_include_list | mongodb | source_options | True | True | 'collection_include_list': (`''`, ...) | +| exclude_columns | mongodb | job_options | False | True | 'exclude_columns': (`''`, ...) | +| column_transformations | mongodb | job_options | False | True | 'column_transformations': {`''` : `''` , ...} | +| skip_snapshots | mongodb | job_options | True | True | 'skip_snapshots': True/False | +| end_at | mongodb | job_options | True | True | 'end_at': `'/NOW'` | +| compute_cluster | mongodb | job_options | True | True | 'compute_cluster': `''` | +| snapshot_parallelism | mongodb | job_options | True | True | 'snapshot_parallelism': `` | +| comment | mongodb | job_options | True | True | 'comment': `''` | diff --git a/website/docs/reference/resource-configs/vertica-configs.md b/website/docs/reference/resource-configs/vertica-configs.md new file mode 100644 index 00000000000..598bc3fecee --- /dev/null +++ b/website/docs/reference/resource-configs/vertica-configs.md @@ -0,0 +1,890 @@ +--- +title: "Vertica configurations" +id: "vertica-configs" +--- +## Configuration of Incremental Models + +### Using the on_schema_change config parameter + +You can use `on_schema_change` parameter with values `ignore`, `fail` and `append_new_columns`. Value `sync_all_columns` is not supported at this time. 
+ +#### Configuring the `ignore` (default) parameter + + + + + + + +```sql + + {{config(materialized = 'incremental',on_schema_change='ignore')}} + + select * from {{ ref('seed_added') }} + + +``` + + + + + + + +```sql + + insert into "VMart"."public"."merge" ("id", "name", "some_date") + ( + select "id", "name", "some_date" + from "merge__dbt_tmp" + ) + +``` + + + + + +#### Configuring the `fail` parameter + + + + + + + + +```sql + {{config(materialized = 'incremental',on_schema_change='fail')}} + + + select * from {{ ref('seed_added') }} + + +``` + + + + + + + +```text + + The source and target schemas on this incremental model are out of sync! + They can be reconciled in several ways: + - set the `on_schema_change` config to either append_new_columns or sync_all_columns, depending on your situation. + - Re-run the incremental model with `full_refresh: True` to update the target schema. + - update the schema manually and re-run the process. + + Additional troubleshooting context: + Source columns not in target: {{ schema_changes_dict['source_not_in_target'] }} + Target columns not in source: {{ schema_changes_dict['target_not_in_source'] }} + New column types: {{ schema_changes_dict['new_target_types'] }} +``` + + + + + +#### Configuring the `append_new_columns` parameter + + + + + + + + +```sql + + +{{ config( materialized='incremental', on_schema_change='append_new_columns') }} + + + + select * from public.seed_added + + +``` + + + + + + + +```sql + + insert into "VMart"."public"."over" ("id", "name", "some_date", "w", "w1", "t1", "t2", "t3") + ( + select "id", "name", "some_date", "w", "w1", "t1", "t2", "t3" + from "over__dbt_tmp" + ) + + + +``` + + + + +### Using the `incremental_strategy` config parameter + +**The `append` strategy (default)**: + +Insert new records without updating or overwriting any existing data. The `append` strategy only adds the new records based on the condition specified in the `is_incremental()` conditional block. 
+ + + + + + + + +```sql + +{{ config( materialized='incremental', incremental_strategy='append' ) }} + + + select * from public.product_dimension + + + {% if is_incremental() %} + + where product_key > (select max(product_key) from {{this }}) + + + {% endif %} +``` + + + + + + + +```sql + + insert into "VMart"."public"."samp" ( + + "product_key", "product_version", "product_description", "sku_number", "category_description", + "department_description", "package_type_description", "package_size", "fat_content", "diet_type", + "weight", "weight_units_of_measure", "shelf_width", "shelf_height", "shelf_depth", "product_price", + "product_cost", "lowest_competitor_price", "highest_competitor_price", "average_competitor_price", "discontinued_flag") + ( + select "product_key", "product_version", "product_description", "sku_number", "category_description", "department_description", "package_type_description", "package_size", "fat_content", "diet_type", "weight", "weight_units_of_measure", "shelf_width", "shelf_height", "shelf_depth", "product_price", "product_cost", "lowest_competitor_price", "highest_competitor_price", "average_competitor_price", "discontinued_flag" + + from "samp__dbt_tmp" + ) + + +``` + + + + + + +**The `merge` strategy**: + +Match records based on a unique_key; update old records, insert new ones. (If no unique_key is specified, all new data is inserted, similar to append.) The unique_key config parameter is required for using the merge strategy, the value accepted by this parameter is a single table column. 
+ + + + + + + + +```sql + + {{ config( materialized = 'incremental', incremental_strategy = 'merge', unique_key='promotion_key' ) }} + + + select * FROM public.promotion_dimension + + +``` + + + + + + +```sql + + + merge into "VMart"."public"."samp" as DBT_INTERNAL_DEST using "samp__dbt_tmp" as DBT_INTERNAL_SOURCE + on DBT_INTERNAL_DEST."promotion_key" = DBT_INTERNAL_SOURCE."promotion_key" + + when matched then update set + "promotion_key" = DBT_INTERNAL_SOURCE."promotion_key", "price_reduction_type" = DBT_INTERNAL_SOURCE."price_reduction_type", "promotion_media_type" = DBT_INTERNAL_SOURCE."promotion_media_type", "display_type" = DBT_INTERNAL_SOURCE."display_type", "coupon_type" = DBT_INTERNAL_SOURCE."coupon_type", "ad_media_name" = DBT_INTERNAL_SOURCE."ad_media_name", "display_provider" = DBT_INTERNAL_SOURCE."display_provider", "promotion_cost" = DBT_INTERNAL_SOURCE."promotion_cost", "promotion_begin_date" = DBT_INTERNAL_SOURCE."promotion_begin_date", "promotion_end_date" = DBT_INTERNAL_SOURCE."promotion_end_date" + + when not matched then insert + ("promotion_key", "price_reduction_type", "promotion_media_type", "display_type", "coupon_type", + "ad_media_name", "display_provider", "promotion_cost", "promotion_begin_date", "promotion_end_date") + values + ( + DBT_INTERNAL_SOURCE."promotion_key", DBT_INTERNAL_SOURCE."price_reduction_type", DBT_INTERNAL_SOURCE."promotion_media_type", DBT_INTERNAL_SOURCE."display_type", DBT_INTERNAL_SOURCE."coupon_type", DBT_INTERNAL_SOURCE."ad_media_name", DBT_INTERNAL_SOURCE."display_provider", DBT_INTERNAL_SOURCE."promotion_cost", DBT_INTERNAL_SOURCE."promotion_begin_date", DBT_INTERNAL_SOURCE."promotion_end_date" + ) + + +``` + + + + + + +###### Using the `merge_update_columns` config parameter + +The `merge_update_columns` config parameter is passed to only update the columns specified and it accepts a list of table columns. 
+ + + + + + + + + + +```sql + + {{ config( materialized = 'incremental', incremental_strategy='merge', unique_key = 'id', merge_update_columns = ["names", "salary"] )}} + + select * from {{ref('seed_tc1')}} + +``` + + + + + + +```sql + merge into "VMart"."public"."test_merge" as DBT_INTERNAL_DEST using "test_merge__dbt_tmp" as DBT_INTERNAL_SOURCE on DBT_INTERNAL_DEST."id" = DBT_INTERNAL_SOURCE."id" + + when matched then update set + "names" = DBT_INTERNAL_SOURCE."names", "salary" = DBT_INTERNAL_SOURCE."salary" + + when not matched then insert + ("id", "names", "salary") + values + ( + DBT_INTERNAL_SOURCE."id", DBT_INTERNAL_SOURCE."names", DBT_INTERNAL_SOURCE."salary" + ) +``` + + + + + +**`delete+insert` strategy**: + +Through the `delete+insert` incremental strategy, you can instruct dbt to use a two-step incremental approach. It will first delete the records detected through the configured `is_incremental()` block and then re-insert them. The `unique_key` parameter is required for the `delete+insert` strategy; it specifies how to update the records when there is duplicate data. The value accepted by this parameter is a single table column. 
+ + + + + + + + + + + +```sql + + {{ config( materialized = 'incremental', incremental_strategy = 'delete+insert', unique_key='date_key' ) }} + + + select * FROM public.date_dimension + +``` + + + + + + +```sql + delete from "VMart"."public"."samp" + where ( + date_key) in ( + select (date_key) + from "samp__dbt_tmp" + ); + + insert into "VMart"."public"."samp" ( + "date_key", "date", "full_date_description", "day_of_week", "day_number_in_calendar_month", "day_number_in_calendar_year", "day_number_in_fiscal_month", "day_number_in_fiscal_year", "last_day_in_week_indicator", "last_day_in_month_indicator", "calendar_week_number_in_year", "calendar_month_name", "calendar_month_number_in_year", "calendar_year_month", "calendar_quarter", "calendar_year_quarter", "calendar_half_year", "calendar_year", "holiday_indicator", "weekday_indicator", "selling_season") + ( + select "date_key", "date", "full_date_description", "day_of_week", "day_number_in_calendar_month", "day_number_in_calendar_year", "day_number_in_fiscal_month", "day_number_in_fiscal_year", "last_day_in_week_indicator", "last_day_in_month_indicator", "calendar_week_number_in_year", "calendar_month_name", "calendar_month_number_in_year", "calendar_year_month", "calendar_quarter", "calendar_year_quarter", "calendar_half_year", "calendar_year", "holiday_indicator", "weekday_indicator", "selling_season" + from "samp__dbt_tmp" + ); + + ``` + + + + +**`insert_overwrite` strategy**: + +The `insert_overwrite` strategy does not use a full-table scan to delete records. Instead of deleting records it drops entire partitions. This strategy may accept `partition_by_string` and `partitions` parameters. You provide these parameters when you want to overwrite a part of the table. + +`partition_by_string` accepts an expression based on which partitioning of the table takes place. This is the PARTITION BY clause in Vertica. + +`partitions` accepts a list of values in the partition column. 
+ +The config parameter `partitions` must be used carefully. Two situations to consider: +- Fewer partitions in the `partitions` parameter than in the where clause: destination table ends up with duplicates. +- More partitions in the `partitions` parameter than in the where clause: destination table ends up missing rows. Fewer rows in destination than in source. + +To understand more about the PARTITION BY clause, check [here](https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/SQLReferenceManual/Statements/partition-clause.htm) + +:::info Note: + +The `partitions` parameter is optional. If the `partitions` parameter is not provided, the partitions in the where clause will be dropped from destination and inserted back from source. If you use a where clause, you might not need the `partitions` parameter. + +The where clause condition is also optional, but if it is not provided, then all data in source is inserted in destination. + +If no where clause condition and no `partitions` parameter are provided, then it drops all partitions from the table and inserts all of them again. + +If the `partitions` parameter is provided but no where clause is provided, the destination table ends up with duplicates because the partitions in the `partitions` parameter are dropped but all data in the source table (no where clause) is inserted in destination. + +The `partition_by_string` config parameter is also optional. If no `partition_by_string` parameter is provided, then it behaves like `delete+insert`. It deletes all records from destination and then it inserts all records from source. It won’t use or drop partitions. + +If neither the `partition_by_string` nor the `partitions` parameter is provided, then the `insert_overwrite` strategy truncates the target table and inserts the source table data into the target. + +If you want to use the `partitions` parameter, then you have to partition the table by passing the `partition_by_string` parameter. 
+ +::: + + + + + + + + + +```sql +{{config(materialized = 'incremental',incremental_strategy = 'insert_overwrite',partition_by_string='YEAR(cc_open_date)',partitions=['2023'])}} + + + select * from online_sales.call_center_dimension +``` + + + + + + + +```sql + + select PARTITION_TABLE('online_sales.update_call_center_dimension'); + + SELECT DROP_PARTITIONS('online_sales.update_call_center_dimension', '2023', '2023'); + + SELECT PURGE_PARTITION('online_sales.update_call_center_dimension', '2023'); + + insert into "VMart"."online_sales"."update_call_center_dimension" + + ("call_center_key", "cc_closed_date", "cc_open_date", "cc_name", "cc_class", "cc_employees", + + "cc_hours", "cc_manager", "cc_address", "cc_city", "cc_state", "cc_region") + + ( + + select "call_center_key", "cc_closed_date", "cc_open_date", "cc_name", "cc_class", "cc_employees", + + "cc_hours", "cc_manager", "cc_address", "cc_city", "cc_state", "cc_region" + + from "update_call_center_dimension__dbt_tmp" + ); + + + ``` + + + + + + +## Optimization options for table materialization + +There are multiple optimizations that can be used when materializing models as tables. Each config parameter applies a Vertica specific clause in the generated `CREATE TABLE` DDL. + +For more information see [Vertica](https://www.vertica.com/docs/12.0.x/HTML/Content/Authoring/SQLReferenceManual/Statements/CREATETABLE.htm) options for table optimization. + +You can configure these optimizations in your model SQL file as described in the examples below: + + ### Configuring the `ORDER BY` clause + + To leverage the `ORDER BY` clause of the `CREATE TABLE` statement use the `order_by` config param in your model. 
+ + #### Using the `order_by` config parameter + + + + + + + +```sql + {{ config( materialized='table', order_by='product_key') }} + + select * from public.product_dimension + + +``` + + + + + + +```sql + + create table "VMart"."public"."order_s__dbt_tmp" as + + ( select * from public.product_dimension) + + order by product_key; + + ``` + + + + +### Configuring the `SEGMENTED BY` clause + +To leverage the `SEGMENTED BY` clause of the `CREATE TABLE` statement, use the `segmented_by_string` or `segmented_by_all_nodes` config parameters in your model. By default ALL NODES are used to segment tables, so the ALL NODES clause in the SQL statement will be added when using `segmented_by_string` config parameter. You can disable ALL NODES using `no_segmentation` parameter. + +To learn more about segmented by clause check [here](https://www.vertica.com/docs/12.0.x/HTML/Content/Authoring/SQLReferenceManual/Statements/hash-segmentation-clause.htm). + + +#### Using the `segmented_by_string` config parameter + +`segmented_by_string` config parameter can be used to segment projection data using a SQL expression like hash segmentation. + + + + + + + + + + +```sql + + {{ config( materialized='table', segmented_by_string='product_key' ) }} + + + select * from public.product_dimension + +``` + + + + + + +```sql + create table + + "VMart"."public"."segmented_by__dbt_tmp" + + as (select * from public.product_dimension) + + segmented by product_key ALL NODES; + + ``` + + + + + +#### Using the `segmented_by_all_nodes` config parameter + +`segmented_by_all_nodes` config parameter can be used to segment projection data for distribution across all cluster nodes. + +:::info Note: + + If you want to pass `segmented_by_all_nodes` parameter then you have to segment the table by passing `segmented_by_string` parameter. 
+ +::: + + + + + + + + +```sql + {{ config( materialized='table', segmented_by_string='product_key' ,segmented_by_all_nodes='True' ) }} + + select * from public.product_dimension + + +``` + + + + + + +```sql + + create table "VMart"."public"."segmented_by__dbt_tmp" as + + (select * from public.product_dimension) + + segmented by product_key ALL NODES; + + ``` + + + + +### Configuring the UNSEGMENTED ALL NODES clause + +To leverage the `UNSEGMENTED ALL NODES` clause of the `CREATE TABLE` statement, use the `no_segmentation` config parameter in your model. + +#### Using the `no_segmentation` config parameter + + + + + + + +```sql + + {{config(materialized='table',no_segmentation='true')}} + + + select * from public.product_dimension + +``` + + + + + + +```sql + + + create table + "VMart"."public"."ww__dbt_tmp" + + INCLUDE SCHEMA PRIVILEGES as ( + + select * from public.product_dimension ) + + UNSEGMENTED ALL NODES ; + + + + ``` + + + + + + +### Configuring the `PARTITION BY` clause + +To leverage the `PARTITION BY` clause of the `CREATE TABLE` statement, use the `partition_by_string`, `partition_by_active_count` or the `partition_by_group_by_string` config parameters in your model. + +To learn more about partition by clause check [here](https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/SQLReferenceManual/Statements/partition-clause.htm) + +#### Using the `partition_by_string` config parameter + +`partition_by_string` (optional) accepts a string value of any one specific `column_name` based on which partitioning of the table data takes place. 
+ + + + + + + + +```sql + + {{ config( materialized='table', partition_by_string='employee_age' )}} + + + select * FROM public.employee_dimension + +``` + + + + + + +```sql + create table "VMart"."public"."test_partition__dbt_tmp" as + + ( select * FROM public.employee_dimension); + + alter table "VMart"."public"."test_partition__dbt_tmp" + + partition BY employee_age + + + ``` + + + + + +#### Using the `partition_by_active_count` config parameter + +`partition_by_active_count` (optional) specifies how many partitions are active for this table. It accepts an integer value. + +:::info Note: + + If you want to pass `partition_by_active_count` parameter then you have to partition the table by passing `partition_by_string` parameter. + +::: + + + + + + + + + +```sql + {{ config( materialized='table', + partition_by_string='employee_age', + partition_by_group_by_string=""" + CASE WHEN employee_age < 5 THEN 1 + WHEN employee_age>50 THEN 2 + ELSE 3 END""", + + partition_by_active_count = 2) }} + + + select * FROM public.employee_dimension + + + ``` + + + + + + +```sql + + create table "VMart"."public"."test_partition__dbt_tmp" as + + ( select * FROM public.employee_dimension ); + + alter table "VMart"."public"."test_partition__dbt_tmp" partition BY employee_age + + group by CASE WHEN employee_age < 5 THEN 1 + + WHEN employee_age>50 THEN 2 + + ELSE 3 END + + SET ACTIVEPARTITIONCOUNT 2 ; + ``` + + + + +#### Using the `partition_by_group_by_string` config parameter + +`partition_by_group_by_string` parameter (optional) accepts a string in which the user specifies all group cases as a single string. + + This is derived from the `partition_by_string` value. + + `partition_by_group_by_string` parameter is used to merge partitions into separate partition groups. + + +:::info Note: + + If you want to pass `partition_by_group_by_string` parameter then you have to partition the table by passing `partition_by_string` parameter. 
+ +::: + + + + + + + + + + +```sql + + {{config(materialized='table', + partition_by_string='number_of_children', + partition_by_group_by_string=""" + CASE WHEN number_of_children <= 2 THEN 'small_family' + ELSE 'big_family' END""")}} +select * from public.customer_dimension +``` + + + + + + +```sql + create table "VMart"."public"."test_partition__dbt_tmp" INCLUDE SCHEMA PRIVILEGES as + + ( select * from public.customer_dimension ) ; + + alter table "VMart"."public"."test_partition__dbt_tmp" + partition BY number_of_children + group by CASE WHEN number_of_children <= 2 THEN 'small_family' + ELSE 'big_family' END ; + ``` + + + + + +### Configuring the KSAFE clause + +To leverage the `KSAFE` clause of the `CREATE TABLE` statement, use the `ksafe` config parameter in your model. + + + + + + + + +```sql +{{ config( materialized='table', ksafe='1' ) }} + + select * from public.product_dimension + + +``` + + + + + + +```sql + create table "VMart"."public"."segmented_by__dbt_tmp" as + + (select * from public.product_dimension ) + ksafe 1; +``` + + + + diff --git a/website/docs/reference/resource-configs/where.md b/website/docs/reference/resource-configs/where.md index 679fcd83751..dbb3b66e901 100644 --- a/website/docs/reference/resource-configs/where.md +++ b/website/docs/reference/resource-configs/where.md @@ -3,13 +3,6 @@ resource_types: [tests] datatype: string --- - - -* `v0.20.0`: Introduced `where` config -* `v0.21.0`: Introduced `config` property for tests. Reimplemented `where` config with `get_where_subquery` macro - - - ### Definition Filter the resource being tested (model, source, seed, or snapshot). @@ -57,6 +50,10 @@ models: values: ["a", "b", "c"] config: where: "date_column = current_date" + - name: other_column + tests: + - not_null: + where: "date_column < current_date" ``` @@ -123,6 +120,8 @@ tests: ### Custom logic +The rendering context for the `where` config is the same as for all configurations defined in `.yml` files. 
You have access to `{{ var() }}` and `{{ env_var() }}`, but you **do not** have access to custom macros for setting this config. If you do want to use custom macros to template out the `where` filter for certain tests, there is a workaround. + As of v0.21, dbt defines a [`get_where_subquery` macro](https://github.com/dbt-labs/dbt-core/blob/main/core/dbt/include/global_project/macros/materializations/tests/where_subquery.sql). dbt replaces `{{ model }}` in generic test definitions with `{{ get_where_subquery(relation) }}`, where `relation` is a `ref()` or `source()` for the resource being tested. The default implementation of this macro returns: @@ -131,4 +130,47 @@ dbt replaces `{{ model }}` in generic test definitions with `{{ get_where_subque You can override this behavior by: - Defining a custom `get_where_subquery` in your root project -- Defining a custom `__get_where_subquery` [dispatch candidate](dispatch) in your package or adapter plugin +- Defining a custom `__get_where_subquery` [dispatch candidate](/reference/dbt-jinja-functions/dispatch) in your package or adapter plugin + +Within this macro definition, you can reference whatever custom macros you want, based on static inputs from the configuration. At simplest, this enables you to DRY up code that you'd otherwise need to repeat across many different `.yml` files. Because the `get_where_subquery` macro is resolved at runtime, your custom macros can also include [fetching the results of introspective database queries](https://docs.getdbt.com/reference/dbt-jinja-functions/run_query). + +**Example:** Filter your test to the past three days of data, using dbt's cross-platform [`dateadd()`](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros#dateadd) utility macro. 
+ + + +```yml +version: 2 +models: + - name: my_model + columns: + - name: id + tests: + - unique: + config: + where: "date_column > __three_days_ago__" # placeholder string for static config +``` + + + + + +```sql +{% macro get_where_subquery(relation) -%} + {% set where = config.get('where') %} + {% if where %} + {% if "__three_days_ago__" in where %} + {# replace placeholder string with result of custom macro #} + {% set three_days_ago = dbt.dateadd('day', -3, current_timestamp()) %} + {% set where = where | replace("__three_days_ago__", three_days_ago) %} + {% endif %} + {%- set filtered -%} + (select * from {{ relation }} where {{ where }}) dbt_subquery + {%- endset -%} + {% do return(filtered) %} + {%- else -%} + {% do return(relation) %} + {%- endif -%} +{%- endmacro %} +``` + + diff --git a/website/docs/reference/resource-properties/columns.md b/website/docs/reference/resource-properties/columns.md index ab001533c9a..ff8aa8734c6 100644 --- a/website/docs/reference/resource-properties/columns.md +++ b/website/docs/reference/resource-properties/columns.md @@ -26,11 +26,11 @@ models: columns: - name: data_type: - [description](description): - [quote](quote): true | false - [tests](resource-properties/tests): ... - [tags](resource-configs/tags): ... - [meta](resource-configs/meta): ... + [description](/reference/resource-properties/description): + [quote](/reference/resource-properties/quote): true | false + [tests](/reference/resource-properties/tests): ... + [tags](/reference/resource-configs/tags): ... + [meta](/reference/resource-configs/meta): ... - name: ... ``` @@ -52,12 +52,12 @@ sources: - name: columns: - name: - [description](description): + [description](/reference/resource-properties/description): data_type: - [quote](quote): true | false - [tests](resource-properties/tests): ... - [tags](resource-configs/tags): ... - [meta](resource-configs/meta): ... 
+ [quote](/reference/resource-properties/quote): true | false + [tests](/reference/resource-properties/tests): ... + [tags](/reference/resource-configs/tags): ... + [meta](/reference/resource-configs/meta): ... - name: ... @@ -76,18 +76,15 @@ version: 2 seeds: - name: - columns: - name: - columns: - - name: - [description](description): - data_type: - [quote](quote): true | false - [tests](resource-properties/tests): ... - [tags](resource-configs/tags): ... - [meta](resource-configs/meta): ... - - name: + [description](/reference/resource-properties/description): + data_type: + [quote](/reference/resource-properties/quote): true | false + [tests](/reference/resource-properties/tests): ... + [tags](/reference/resource-configs/tags): ... + [meta](/reference/resource-configs/meta): ... + - name: ... ``` @@ -106,12 +103,12 @@ snapshots: - name: columns: - name: - [description](description): + [description](/reference/resource-properties/description): data_type: - [quote](quote): true | false - [tests](resource-properties/tests): ... - [tags](resource-configs/tags): ... - [meta](resource-configs/meta): ... + [quote](/reference/resource-properties/quote): true | false + [tests](/reference/resource-properties/tests): ... + [tags](/reference/resource-configs/tags): ... + [meta](/reference/resource-configs/meta): ... - name: ``` @@ -132,7 +129,7 @@ analyses: - name: columns: - name: - [description](description): + [description](/reference/resource-properties/description): data_type: - name: @@ -150,6 +147,8 @@ Columns are not resources in and of themselves. Instead, they are child properti - `tests` - `description` -Because columns are not resources, their `tags` and `meta` properties are not true configurations. They do not inherit the `tags` or `meta` values of their parent resources. 
However, you can select a generic test, defined on a column, using tags applied to its column or top-level resource; see [test selection examples](test-selection-examples#run-tests-on-tagged-columns). +Because columns are not resources, their `tags` and `meta` properties are not true configurations. They do not inherit the `tags` or `meta` values of their parent resources. However, you can select a generic test, defined on a column, using tags applied to its column or top-level resource; see [test selection examples](/reference/node-selection/test-selection-examples#run-tests-on-tagged-columns). -Columns may optionally define a `data_type`. This is for metadata purposes only, such as to use alongside the [`external`](resource-properties/external) property of sources. +Columns may optionally define a `data_type`, which is necessary for: +- Enforcing a model [contract](/reference/resource-configs/contract) +- Use in other packages or plugins, such as the [`external`](/reference/resource-properties/external) property of sources and [`dbt-external-tables`](https://hub.getdbt.com/dbt-labs/dbt_external_tables/latest/) diff --git a/website/docs/reference/resource-properties/config.md b/website/docs/reference/resource-properties/config.md index 636651c45b4..e6021def852 100644 --- a/website/docs/reference/resource-properties/config.md +++ b/website/docs/reference/resource-properties/config.md @@ -1,11 +1,10 @@ --- +title: "About config property" +sidebar_label: "config" resource_types: [models, seeds, snapshots, tests, sources, metrics, exposures] datatype: "{dictionary}" --- - - - **v0.21.0** introduced the `config` property - config: - [](model-configs): + [](/reference/model-configs): ... ``` @@ -48,7 +47,7 @@ version: 2 seeds: - name: config: - [](seed-configs): + [](/reference/seed-configs): ... ``` @@ -66,7 +65,7 @@ version: 2 snapshots: - name: config: - [](snapshot-configs): + [](/reference/snapshot-configs): ... ``` @@ -91,14 +90,14 @@ version: 2 : ... 
- [columns](columns): + [columns](/reference/resource-properties/columns): - name: tests: - [](#test_name) - [](#test_name): : config: - [](test-configs): + [](/reference/test-configs): ... ``` @@ -109,13 +108,6 @@ version: 2 - - -We have added support for the `config` property on sources in dbt Core v1.1 - - - - @@ -125,17 +117,15 @@ version: 2 sources: - name: config: - [](source-configs): + [](/reference/source-configs): tables: - name: config: - [](source-configs): + [](/reference/source-configs): ``` - - @@ -194,4 +184,4 @@ exposures: -The `config` property allows you to configure resources at the same time you're defining properties in yaml files. +The `config` property allows you to configure resources at the same time you're defining properties in YAML files. diff --git a/website/docs/reference/resource-properties/constraints.md b/website/docs/reference/resource-properties/constraints.md new file mode 100644 index 00000000000..4e500ed64ea --- /dev/null +++ b/website/docs/reference/resource-properties/constraints.md @@ -0,0 +1,517 @@ +--- +resource_types: [models] +datatype: "{dictionary}" +--- + +:::info New functionality +This functionality is new in v1.5. +::: + +Constraints are a feature of many data platforms. When specified, the platform will perform additional validation on data as it is being populated in a new table or inserted into a preexisting table. If the validation fails, the table creation or update fails, the operation is rolled back, and you will see a clear error message. + +When enforced, a constraint guarantees that you will never see invalid data in the table materialized by your model. Enforcement varies significantly by data platform. + +Constraints require the declaration and enforcement of a model [contract](/reference/resource-configs/contract). + +**Constraints are never applied on `ephemeral` models or those materialized as `view`**. Only `table` and `incremental` models support applying and enforcing constraints. 
+ +## Defining constraints + +Constraints may be defined for a single column, or at the model level for one or more columns. As a general rule, we recommend defining single-column constraints directly on those columns. + +The structure of a constraint is: +- `type` (required): one of `not_null`, `unique`, `primary_key`, `foreign_key`, `check`, `custom` +- `expression`: Free text input to qualify the constraint. Required for certain constraint types, and optional for others. +- `name` (optional): Human-friendly name for this constraint. Supported by some data platforms. +- `columns` (model-level only): List of column names to apply the constraint over + + + +```yml +models: + - name: + + # required + config: + contract: + enforced: true + + # model-level constraints + constraints: + - type: primary_key + columns: [, , ...] + - type: check + columns: [, , ...] + expression: " != " + name: human_friendly_name + - type: ... + + columns: + - name: + data_type: + + # column-level constraints + constraints: + - type: not_null + - type: unique + - type: foreign_key + expression: . () + - type: ... +``` + + + +## Platform-specific support + +In transactional databases, it is possible to define "constraints" on the allowed values of certain columns, stricter than just the data type of those values. For example, Postgres supports and enforces all the constraints in the ANSI SQL standard (`not null`, `unique`, `primary key`, `foreign key`), plus a flexible row-level `check` constraint that evaluates to a boolean expression. + +Most analytical data platforms support and enforce a `not null` constraint, but they either do not support or do not enforce the rest. It is sometimes still desirable to add an "informational" constraint, knowing it is _not_ enforced, for the purpose of integrating with legacy data catalog or entity-relation diagram tools ([dbt-core#3295](https://github.com/dbt-labs/dbt-core/issues/3295)). 
+ +To that end, there are two optional fields you can specify on any constraint: +- `warn_unenforced: False` to skip warning on constraints that are supported, but not enforced, by this data platform. The constraint will be included in templated DDL. +- `warn_unsupported: False` to skip warning on constraints that aren't supported by this data platform, and therefore won't be included in templated DDL. + + + +
          + +* PostgreSQL constraints documentation: [here](https://www.postgresql.org/docs/current/ddl-constraints.html#id-1.5.4.6.6) + + + +```sql +{{ + config( + materialized = "table" + ) +}} + +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +``` + + + + + +```yml +models: + - name: dim_customers + config: + contract: + enforced: true + columns: + - name: customer_id + data_type: int + constraints: + - type: not_null + - type: primary_key + - type: check + expression: "id > 0" + - name: customer_name + data_type: text + - name: first_transaction_date + data_type: date +``` + + + +Expected DDL to enforce constraints: + + +```sql +create table "database_name"."schema_name"."constraints_example__dbt_tmp" +( + id integer not null primary key check (id > 0), + customer_name text, + first_transaction_date date +) +; +insert into "database_name"."schema_name"."constraints_example__dbt_tmp" +( + id, + customer_name, + first_transaction_date +) +( +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +); +``` + + + +
          + +
          + +Redshift currently only enforces `not null` constraints; all other constraints are metadata only. Additionally, Redshift does not allow column checks at the time of table creation. See more in the Redshift documentation [here](https://docs.aws.amazon.com/redshift/latest/dg/t_Defining_constraints.html). + + + +```sql +{{ + config( + materialized = "table" + ) +}} + +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +``` + + + + + +```yml +models: + - name: dim_customers + config: + contract: + enforced: true + columns: + - name: id + data_type: integer + constraints: + - type: not_null + - type: primary_key # not enforced -- will warn & include + - type: check # not supported -- will warn & skip + expression: "id > 0" + tests: + - unique # primary_key constraint is not enforced + - name: customer_name + data_type: varchar + - name: first_transaction_date + data_type: date +``` + + + +Expected DDL to enforce constraints: + + +```sql + +create table "database_name"."schema_name"."constraints_example__dbt_tmp" + +( + id integer not null, + customer_name varchar, + first_transaction_date date, + primary key(id) +) +; + +insert into "database_name"."schema_name"."constraints_example__dbt_tmp" +( +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +); +``` + + + + +
          + +
          + +- Snowflake constraints documentation: [here](https://docs.snowflake.com/en/sql-reference/constraints-overview.html) +- Snowflake data types: [here](https://docs.snowflake.com/en/sql-reference/intro-summary-data-types.html) + +Snowflake supports four types of constraints: `unique`, `not null`, `primary key` and `foreign key`. + +It is important to note that only the `not null` constraint (and the `not null` property of `primary key`) is actually checked today. +The rest of the constraints are purely metadata, not verified when inserting data. + +Currently, Snowflake doesn't support the `check` syntax and dbt will skip the `check` config and raise a warning message if it is set on some models in the dbt project. + + + +```sql +{{ + config( + materialized = "table" + ) +}} + +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +``` + + + + + +```yml +models: + - name: dim_customers + config: + contract: + enforced: true + columns: + - name: id + data_type: integer + description: hello + constraints: + - type: not_null + - type: primary_key # not enforced -- will warn & include + - type: check # not supported -- will warn & skip + expression: "id > 0" + tests: + - unique # primary_key constraint is not enforced + - name: customer_name + data_type: text + - name: first_transaction_date + data_type: date +``` + + + +Expected DDL to enforce constraints: + + +```sql +create or replace transient table ..constraints_model +( + id integer not null primary key, + customer_name text, + first_transaction_date date +) +as +( +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +); +``` + + + +
          + +
          + +BigQuery allows defining `not null` constraints. However, it does _not_ support or enforce the definition of unenforced constraints, such as `primary key`. + +Documentation: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-definition-language + +Data types: https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types + + + +```sql +{{ + config( + materialized = "table" + ) +}} + +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +``` + + + + + +```yml +models: + - name: dim_customers + config: + contract: + enforced: true + columns: + - name: id + data_type: int + constraints: + - type: not_null + - type: primary_key # not enforced -- will warn & include + - type: check # not supported -- will warn & skip + expression: "id > 0" + tests: + - unique # primary_key constraint is not enforced + - name: customer_name + data_type: string + - name: first_transaction_date + data_type: date +``` + + + +### Column-level constraint on nested column: + + + +```sql +{{ + config( + materialized = "table" + ) +}} + +select + 'string' as a, + struct( + 1 as id, + 'name' as name, + struct(2 as id, struct('test' as again, '2' as even_more) as another) as double_nested + ) as b +``` + + + + + +```yml +version: 2 + +models: + - name: nested_column_constraints_example + config: + contract: + enforced: true + columns: + - name: a + data_type: string + - name: b.id + data_type: integer + constraints: + - type: not_null + - name: b.name + description: test description + data_type: string + - name: b.double_nested.id + data_type: integer + - name: b.double_nested.another.again + data_type: string + - name: b.double_nested.another.even_more + data_type: integer + constraints: + - type: not_null +``` + + + +### Expected DDL to enforce constraints: + + + +```sql +create or replace table ``.``.`constraints_model` +( + id integer not null, + customer_name string, + first_transaction_date 
date +) +as +( +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +); +``` + + + +
          + +
          + +Databricks allows you to define: + +- a `not null` constraint +- and/or additional `check` constraints, with conditional expressions including one or more columns + +Because Databricks neither supports transactions nor allows using `create or replace table` with a column schema, the table is first created without a schema and `alter` statements are then executed to add the different constraints. + +This means that: + +- The names and order of columns are checked but not their type +- If the `constraints` and/or `constraint_check` fail, the table with the failing data will still exist in the Warehouse + +See [this page](https://docs.databricks.com/tables/constraints.html) for more details about the support of constraints on Databricks. + + + +```sql +{{ + config( + materialized = "table" + ) +}} + +select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +``` + + + + + +```yml +models: + - name: dim_customers + config: + contract: + enforced: true + columns: + - name: id + data_type: int + constraints: + - type: not_null + - type: primary_key # not enforced -- will warn & include + - type: check # not supported -- will warn & skip + expression: "id > 0" + tests: + - unique # primary_key constraint is not enforced + - name: customer_name + data_type: text + - name: first_transaction_date + data_type: date +``` + + + +Expected DDL to enforce constraints: + + +```sql + create or replace table schema_name.my_model + using delta + as + select + 1 as id, + 'My Favorite Customer' as customer_name, + cast('2019-01-01' as date) as first_transaction_date +``` + + + +Followed by the statements + +```sql +alter table schema_name.my_model change column id set not null; +alter table schema_name.my_model add constraint 472394792387497234 check (id > 0); +``` + +
          + +
          diff --git a/website/docs/reference/resource-properties/database.md b/website/docs/reference/resource-properties/database.md index c738eade322..c2f6ba76dd8 100644 --- a/website/docs/reference/resource-properties/database.md +++ b/website/docs/reference/resource-properties/database.md @@ -1,4 +1,6 @@ --- +title: "Defining a database source property" +sidebar_label: "database" resource_types: sources datatype: database_name --- diff --git a/website/docs/reference/resource-properties/deprecation_date.md b/website/docs/reference/resource-properties/deprecation_date.md new file mode 100644 index 00000000000..830412d2af6 --- /dev/null +++ b/website/docs/reference/resource-properties/deprecation_date.md @@ -0,0 +1,86 @@ +--- +resource_types: [models] +datatype: deprecation_date +required: no +--- + + + +```yml +models: + - name: my_model + description: deprecated + deprecation_date: 1999-01-01 00:00:00.00+00:00 +``` + + + + +```yml +version: 2 +models: + - name: my_model + description: deprecating in the future + deprecation_date: 2999-01-01 00:00:00.00+00:00 +``` + + + +## Definition + +The deprecation date of the model is formatted as a date, optionally with a timezone offset. Supported RFC 3339 formats include: +- `YYYY-MM-DD hh:mm:ss.sss±hh:mm` +- `YYYY-MM-DD hh:mm:ss.sss` +- `YYYY-MM-DD` + +When `deprecation_date` does not include an offset from UTC, then it is interpreted as being in the system time zone of the dbt execution environment. + +## Explanation + +### Purpose + +Declaring a `deprecation_date` for a dbt model provides a mechanism to communicate plans and timelines for long-term support and maintenance and to facilitate change management. + +Setting a `deprecation_date` works well in conjunction with other [model governance](/docs/collaborate/govern/about-model-governance) features like [model versions](/docs/collaborate/govern/model-versions), but can also be used independently from them. 
+ +### Warning messages + +When a project references a model that's slated for deprecation or the deprecation date has passed, a warning is generated. If it's a versioned model, with a newer version available, then the warning says so. This added bit of cross-team communication, from producers to consumers, is an advantage of using dbt's built-in functionality around model versions to facilitate migrations. + +Additionally, [`WARN_ERROR_OPTIONS`](/reference/global-configs/warnings) gives a mechanism whereby users can promote these warnings to actual runtime errors: + +| Warning | Scenario | Affected projects | +|--------------------------------|----------------------------------------------------|------------------------| +| `DeprecatedModel` | Parsing a project that defines a deprecated model | Producer | +| `DeprecatedReference` | Referencing a model with a past deprecation date | Producer and consumers | +| `UpcomingDeprecationReference` | Referencing a model with a future deprecation date | Producer and consumers | + +**Example** + +Example output for an `UpcomingDeprecationReference` warning: +``` +$ dbt parse +15:48:14 Running with dbt=1.6.0 +15:48:14 Registered adapter: postgres=1.6.0 +15:48:14 [WARNING]: While compiling 'my_model_ref': Found a reference to my_model, which is slated for deprecation on '2038-01-19T03:14:07-00:00'. +``` + +### Selection syntax + +There is no specific [node selection syntax](/reference/node-selection/syntax) for `deprecation_date`. [Programmatic invocations](/reference/programmatic-invocations) are one way to identify deprecated models (potentially in conjunction with [dbt list](/reference/commands/list)). For example, `dbt -q ls --output json --output-keys database schema alias deprecation_date`. + +### Deprecation process + +Additional steps are necessary to save on build-related compute and storage costs for a deprecated model. 
+ +Deprecated models can continue to be built by producers and be selected by consumers until they are [disabled](/reference/resource-configs/enabled) or removed. + +Just like it does not automatically [drop relations when models are deleted](/faqs/models/removing-deleted-models), dbt does not drop relations for deprecated models. + +Strategies similar to [here](https://discourse.getdbt.com/t/faq-cleaning-up-removed-models-from-your-production-schema/113) or [here](https://discourse.getdbt.com/t/clean-your-warehouse-of-old-and-deprecated-models/1547) can be used to drop relations that have been deprecated and are no longer in use. + +### Table expiration on BigQuery + +dbt-bigquery can set an [`hours_to_expiration`](/reference/resource-configs/bigquery-configs#controlling-table-expiration) that translates to `expiration_timestamp` within BigQuery. + +dbt does not automatically synchronize `deprecation_date` and `hours_to_expiration`, but users may want to coordinate them in some fashion (such as setting a model to expire 48 hours after its `deprecation_date`). Expired tables in BigQuery will be deleted and their storage reclaimed. diff --git a/website/docs/reference/resource-properties/description.md b/website/docs/reference/resource-properties/description.md index 9a5342473d7..f8469dc3494 100644 --- a/website/docs/reference/resource-properties/description.md +++ b/website/docs/reference/resource-properties/description.md @@ -157,13 +157,13 @@ A user-defined description. Can be used to document: - analyses, and analysis columns - macros, and macro arguments -These descriptions are used in the documentation website rendered by dbt (see [the documentation guide](documentation)). +These descriptions are used in the documentation website rendered by dbt (see [the documentation guide](/docs/collaborate/documentation)). -Descriptions can include markdown, as well as the [`doc` jinja function](doc). 
+Descriptions can include markdown, as well as the [`doc` jinja function](/reference/dbt-jinja-functions/doc). :::caution You may need to quote your YAML -Be mindful of YAML semantics when providing a description. If your description contains special yaml characters like curly brackets, colons, or square brackets, you may need to quote your description. An example of a quoted description is shown [below](#use-some-markdown-in-a-description). +Be mindful of YAML semantics when providing a description. If your description contains special YAML characters like curly brackets, colons, or square brackets, you may need to quote your description. An example of a quoted description is shown [below](#use-some-markdown-in-a-description). ::: @@ -235,7 +235,7 @@ models: ### Use a docs block in a description -If you have a long description, especially if it contains markdown, it may make more sense to leverage a [`docs` block](doc). A benefit of this approach is that code editors will correctly highlight markdown, making it easier to debug as you write. +If you have a long description, especially if it contains markdown, it may make more sense to leverage a [`docs` block](/reference/dbt-jinja-functions/doc). A benefit of this approach is that code editors will correctly highlight markdown, making it easier to debug as you write. @@ -307,7 +307,7 @@ models: ### Include an image from your repo in your descriptions To include an image from your repository in your descriptions: 1. Add the file in a subdirectory, e.g. `assets/dbt-logo.svg` -2. Set the [`asset-paths` config](project-configs/asset-paths) in your `dbt_project.yml` file so that this directory gets copied to the `target/` directory as part of `dbt docs generate` +2. 
Set the [`asset-paths` config](/reference/project-configs/asset-paths) in your `dbt_project.yml` file so that this directory gets copied to the `target/` directory as part of `dbt docs generate` diff --git a/website/docs/reference/resource-properties/external.md b/website/docs/reference/resource-properties/external.md index a058d4dc685..114e09efbaa 100644 --- a/website/docs/reference/resource-properties/external.md +++ b/website/docs/reference/resource-properties/external.md @@ -36,6 +36,6 @@ the Hive external spec. You may define and use as many addit You may wish to define the `external` property in order to: - Power macros that introspect [`graph.sources`](/reference/dbt-jinja-functions/graph) -- Define metadata that you can later extract from the [manifest](manifest-json) +- Define metadata that you can later extract from the [manifest](/reference/artifacts/manifest-json) For an example of how this property can be used to power custom workflows, see the [`dbt-external-tables`](https://github.com/dbt-labs/dbt-external-tables) package. 
diff --git a/website/docs/reference/resource-properties/freshness.md b/website/docs/reference/resource-properties/freshness.md index d5f803419dd..f332f5a1b8f 100644 --- a/website/docs/reference/resource-properties/freshness.md +++ b/website/docs/reference/resource-properties/freshness.md @@ -85,13 +85,7 @@ This filter *only* applies to dbt's source freshness queries - it will not impac This is particularly useful if: - You are using BigQuery and your source tables are [partitioned tables](https://cloud.google.com/bigquery/docs/partitioned-tables) -- You are using Snowflake or Spark with large tables, and this results in a performance benefit - - - -* `v0.15.0`: This property was introduced - - +- You are using Snowflake, Databricks or Spark with large tables, and this results in a performance benefit ## Examples diff --git a/website/docs/reference/resource-properties/include-exclude.md b/website/docs/reference/resource-properties/include-exclude.md new file mode 100644 index 00000000000..90863623b0d --- /dev/null +++ b/website/docs/reference/resource-properties/include-exclude.md @@ -0,0 +1,105 @@ +--- +resource_types: [models] +title: include +required: no +--- + + + + +```yml +version: 2 + +models: + + # top-level model properties + - name: + [columns](/reference/resource-properties/columns): + - name: # required + + # versions of this model + [versions](/reference/resource-properties/versions): + - v: # required + columns: + - include: '*' | 'all' | [, ...] + exclude: + - + - ... # declare additional column names to exclude + + # declare more columns -- can be overrides from top-level, or in addition + - name: + ... + +``` + + + +## Definition +The specification of which columns are defined in a model's top-level `columns` property to include or exclude in a versioned implementation of that model. 
+ +`include` is either: +- a list of specific column names to include +- `'*'` or `'all'`, indicating that **all** columns from the top-level `columns` property should be included in the versioned model + +`exclude` is a list of column names to exclude. It can only be declared if `include` is set to one of `'*'` or `'all'`. + +The `columns` list of a versioned model can have _at most one_ `include/exclude` element. + +You may declare additional columns within the version's `columns` list. If a version-specific column's `name` matches a column included from the top level, the version-specific entry will override that column for that version. + +## Default + +By default, `include` is "all", and `exclude` is the empty list. This has the effect of including all columns from the base model in the versioned model. + +## Example + + + +```yml +models: + - name: customers + columns: + - name: customer_id + description: Unique identifier for this table + data_type: text + constraints: + - type: not_null + tests: + - unique + - name: customer_country + data_type: text + description: "Country where the customer currently lives" + - name: first_purchase_date + data_type: date + + versions: + - v: 4 + + - v: 3 + columns: + - include: "*" + - name: customer_country + data_type: text + description: "Country where the customer first lived at time of first purchase" + + - v: 2 + columns: + - include: "*" + exclude: + - customer_country + + - v: 1 + columns: + - include: [] + - name: id + data_type: int +``` + + + +Because `v4` has not specified any `columns`, it will include all of the top-level `columns`. + +Each other version has declared a modification from the top-level property: +- `v3` will include all columns, but it reimplements the `customer_country` column with a different `description` +- `v2` will include all columns *except* `customer_country` +- `v1` doesn't include *any* of the top-level `columns`. Instead, it declares only a single integer column named `id`. 
diff --git a/website/docs/reference/resource-properties/latest_version.md b/website/docs/reference/resource-properties/latest_version.md new file mode 100644 index 00000000000..4c531879598 --- /dev/null +++ b/website/docs/reference/resource-properties/latest_version.md @@ -0,0 +1,68 @@ +--- +resource_types: [models] +datatype: latest_version +required: no +--- + + + +```yml +models: + - name: model_name + latest_version: 2 + [versions](/reference/resource-properties/versions): + - v: 2 + - v: 1 +``` + + + +## Definition + +The latest version of this model. The "latest" version is relevant for: +1. Resolving `ref()` calls to this model that are "unpinned" (a version is not explicitly specified) +2. Selecting model versions using the [`version:` selection method](/reference/node-selection/methods#the-version-method), based on whether a given model version is `latest`, `prerelease`, or `old` + +This value can be a string or a numeric (integer or float) value. It must be one of the [version identifiers](/reference/resource-properties/versions#v) specified in this model's list of `versions`. + +## Default + +If not specified for a versioned model, `latest_version` defaults to the largest [version identifier](/reference/resource-properties/versions#v): numerically greatest (if all version identifiers are numeric), otherwise the alphabetically last (if they are strings). + +For a non-versioned model (no `versions` list), `latest_version` has no value. + +If `latest_version` is not specified for a versioned model, `latest_version` defaults to the largest version identifier. + + +## Example + + + +```yml +models: + - name: model_name + [versions](/reference/resource-properties/versions): + - v: 3 + - v: 2 + - v: 1 +``` + + + +If `latest_version` is not specified, the `latest_version` is `3`. Any unpinned references -- `ref('model_name')` -- will resolve to `model_name.v3`. Both `v1` and `v2` are considered "old" versions. 
+ + + +```yml +models: + - name: model_name + latest_version: 2 + [versions](/reference/resource-properties/versions): + - v: 3 + - v: 2 + - v: 1 +``` + + + +In this case, the `latest_version` is explicitly set to `2`. Any unpinned references will resolve to `model_name.v2`. `v3` is considered "prerelease", and `v1` is considered "old". diff --git a/website/docs/reference/resource-properties/overrides.md b/website/docs/reference/resource-properties/overrides.md index e36e42d9753..ba3a76cbea5 100644 --- a/website/docs/reference/resource-properties/overrides.md +++ b/website/docs/reference/resource-properties/overrides.md @@ -24,15 +24,15 @@ in the overriding source will be applied on top of the base properties of the overridden source. The following source properties can be overridden: - - [description](resource-properties/description) - - [meta](resource-configs/meta) - - [database](resource-properties/database) - - [schema](resource-properties/schema) - - [loader](resource-properties/loader) - - [quoting](resource-properties/quoting) - - [freshness](resource-properties/freshness) - - [loaded_at_field](resource-properties/freshness#loaded_at_field) - - [tags](resource-configs/tags) + - [description](/reference/resource-properties/description) + - [meta](/reference/resource-configs/meta) + - [database](/reference/resource-properties/database) + - [schema](/reference/resource-properties/schema) + - [loader](/reference/resource-properties/loader) + - [quoting](/reference/resource-properties/quoting) + - [freshness](/reference/resource-properties/freshness) + - [loaded_at_field](/reference/resource-properties/freshness#loaded_at_field) + - [tags](/reference/resource-configs/tags) ## Examples ### Supply your database and schema name for a source defined in a package diff --git a/website/docs/reference/resource-properties/quote.md b/website/docs/reference/resource-properties/quote.md index 653e9b75343..50bf4c08c40 100644 --- 
a/website/docs/reference/resource-properties/quote.md +++ b/website/docs/reference/resource-properties/quote.md @@ -115,12 +115,6 @@ analyses: ## Definition The `quote` field can be used to enable or disable quoting for column names. - - -* `v0.16.0`: This configuration was added - - - ## Default The default quoting value is `false` @@ -129,7 +123,7 @@ This is particularly relevant to those using Snowflake, where quoting can be par This property is useful when: - A source has a column that needs to be quoted to be selected, for example, to preserve column casing -- A seed was created with `quote_columns: true` ([docs](quote_columns)) on Snowflake +- A seed was created with `quote_columns: true` ([docs](/reference/resource-configs/quote_columns)) on Snowflake - A model uses quotes in the SQL, potentially to work around the use of reserved words ```sql select user_group as "group" diff --git a/website/docs/reference/resource-properties/quoting.md b/website/docs/reference/resource-properties/quoting.md index 5cb84f53789..eef4a3749fd 100644 --- a/website/docs/reference/resource-properties/quoting.md +++ b/website/docs/reference/resource-properties/quoting.md @@ -1,4 +1,6 @@ --- +title: "Configuring quoting in sources" +sidebar_label: "quoting" datatype: boolean # -ish, it's actually a dictionary of bools default: true --- diff --git a/website/docs/reference/resource-properties/schema.md b/website/docs/reference/resource-properties/schema.md index c7e70d6e318..9e6a09b8569 100644 --- a/website/docs/reference/resource-properties/schema.md +++ b/website/docs/reference/resource-properties/schema.md @@ -1,4 +1,6 @@ --- +title: "Defining a schema source property" +sidebar_label: "schema" resource_types: sources datatype: schema_name --- diff --git a/website/docs/reference/resource-properties/tests.md b/website/docs/reference/resource-properties/tests.md index da78376f57b..6e2c02c6bc5 100644 --- a/website/docs/reference/resource-properties/tests.md +++ 
b/website/docs/reference/resource-properties/tests.md @@ -1,6 +1,9 @@ --- +title: "About tests property" +sidebar_label: "tests" resource_types: all datatype: test +keywords: [test, tests, custom tests, custom test name, test name] --- ](#test_name): : - [config](resource-properties/config): - [](test-configs): + [config](/reference/resource-properties/config): + [](/reference/test-configs): - [columns](columns): + [columns](/reference/resource-properties/columns): - name: tests: - [](#test_name) - [](#test_name): : - [config](resource-properties/config): - [](test-configs): + [config](/reference/resource-properties/config): + [](/reference/test-configs): ``` @@ -58,8 +61,8 @@ sources: - [](#test_name) - [](#test_name): : - [config](resource-properties/config): - [](test-configs): + [config](/reference/resource-properties/config): + [](/reference/test-configs): columns: - name: @@ -67,8 +70,8 @@ sources: - [](#test_name) - [](#test_name): : - [config](resource-properties/config): - [](test-configs): + [config](/reference/resource-properties/config): + [](/reference/test-configs): ``` @@ -89,8 +92,8 @@ seeds: - [](#test_name) - [](#test_name): : - [config](resource-properties/config): - [](test-configs): + [config](/reference/resource-properties/config): + [](/reference/test-configs): columns: - name: @@ -98,8 +101,8 @@ seeds: - [](#test_name) - [](#test_name): : - [config](resource-properties/config): - [](test-configs): + [config](/reference/resource-properties/config): + [](/reference/test-configs): ``` @@ -120,8 +123,8 @@ snapshots: - [](#test_name) - [](#test_name): : - [config](resource-properties/config): - [](test-configs): + [config](/reference/resource-properties/config): + [](/reference/test-configs): columns: - name: @@ -129,8 +132,8 @@ snapshots: - [](#test_name) - [](#test_name): : - [config](resource-properties/config): - [](test-configs): + [config](/reference/resource-properties/config): + [](/reference/test-configs): ``` @@ -153,7 +156,7 @@ This 
feature is not implemented for analyses. ## Description -The `tests` property defines assertions about a column, , or . The property contains a list of [generic tests](/docs/build/tests#generic-tests), referenced by name, which can include the four built-in generic tests available in dbt. For example, you can add tests that ensure a column contains no duplicates and zero null values. Any arguments or [configurations](test-configs) passed to those tests should be nested below the test name. +The `tests` property defines assertions about a column, , or . The property contains a list of [generic tests](/docs/build/tests#generic-tests), referenced by name, which can include the four built-in generic tests available in dbt. For example, you can add tests that ensure a column contains no duplicates and zero null values. Any arguments or [configurations](/reference/test-configs) passed to those tests should be nested below the test name. Once these tests are defined, you can validate their correctness by running `dbt test`. @@ -254,12 +257,12 @@ models: -The `to` argument accepts a [Relation](dbt-classes#relation) – this means you can pass it a `ref` to a model (e.g. `ref('customers')`), or a `source` (e.g. `source('jaffle_shop', 'customers')`). +The `to` argument accepts a [Relation](/reference/dbt-classes#relation) – this means you can pass it a `ref` to a model (e.g. `ref('customers')`), or a `source` (e.g. `source('jaffle_shop', 'customers')`). ## Additional examples -### Testing an expression -Some tests require multiple columns, so it doesn't make sense to nest them under the `columns:` key. In this case you can apply the test to the model (or source, seed or snapshot) instead: +### Test an expression +Some tests require multiple columns, so it doesn't make sense to nest them under the `columns:` key. 
In this case, you can apply the test to the model (or source, seed, or snapshot) instead: @@ -275,7 +278,7 @@ models: -### Define and use a custom generic test +### Use custom generic test If you've defined your own custom generic test, you can use that as the `test_name`: @@ -295,11 +298,9 @@ models:
          -Check out the guide on writing a [custom generic test](custom-generic-tests) for more information. +Check out the guide on writing a [custom generic test](/guides/best-practices/writing-custom-generic-tests) for more information. - - -### Define a custom name for one test +### Custom test name By default, dbt will synthesize a name for your generic test by concatenating: - test name (`not_null`, `unique`, etc) @@ -349,7 +350,7 @@ $ dbt test --select unexpected_order_status_today 12:43:41 Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1 ``` -A test's name must be unique for all tests defined on a given model-column combination. If you give the same name to tests defined on several different columns, or across several different models, then `dbt test --select ` will select them all. +A test's name must be unique for all tests defined on a given model-column combination. If you give the same name to tests defined on several different columns, or across several different models, then `dbt test --select ` will select them all. **When might you need this?** In cases where you have defined the same test twice, with only a difference in configuration, dbt will consider these tests to be duplicates: @@ -388,7 +389,7 @@ Compilation Error - test.testy.accepted_values_orders_status__placed__shipped__completed__returned.69dce9e5d5 (models/one_file.yml) ``` -By providing a custom name, you enable dbt to disambiguate them: +By providing a custom name, you help dbt differentiate tests: @@ -433,15 +434,11 @@ $ dbt test 12:48:04 Done. PASS=2 WARN=0 ERROR=0 SKIP=0 TOTAL=2 ``` -**If using [`store_failures`](resource-configs/store_failures):** dbt uses each test's name as the name of the table in which to store any failing records. If you have defined a custom name for one test, that custom name will also be used for its table of failures. 
You may optionally configure an [`alias`](resource-configs/alias) for the test, in order to separately control both the name of the test (for metadata) and the name of its database table (for storing failures). - - - - +**If using [`store_failures`](/reference/resource-configs/store_failures):** dbt uses each test's name as the name of the table in which to store any failing records. If you have defined a custom name for one test, that custom name will also be used for its table of failures. You may optionally configure an [`alias`](/reference/resource-configs/alias) for the test, to separately control both the name of the test (for metadata) and the name of its database table (for storing failures). ### Alternative format for defining tests -When defining a generic test with a number of arguments and configurations, the YAML can look and feel unwieldy. If you find it easier, you can define the same test properties as top-level keys of a single dictionary, by providing the test name as `test_name` instead. It's totally up to you. +When defining a generic test with several arguments and configurations, the YAML can look and feel unwieldy. If you find it easier, you can define the same test properties as top-level keys of a single dictionary, by providing the test name as `test_name` instead. It's totally up to you. 
This example is identical to the one above: @@ -467,5 +464,3 @@ models: ``` - - diff --git a/website/docs/reference/resource-properties/versions.md b/website/docs/reference/resource-properties/versions.md new file mode 100644 index 00000000000..86e9abf34a8 --- /dev/null +++ b/website/docs/reference/resource-properties/versions.md @@ -0,0 +1,126 @@ +--- +resource_types: [models] +datatype: list +required: no +keywords: [governance, model version, model versioning, dbt model versioning] +--- + +import VersionsCallout from '/snippets/_version-callout.md'; + + + + + +```yml +version: 2 + +models: + - name: model_name + versions: + - v: # required + defined_in: # optional -- default is <model_name>_v<v> + columns: + # specify all columns, or include/exclude columns from the top-level model YAML definition + - [include](/reference/resource-properties/include-exclude): + [exclude](/reference/resource-properties/include-exclude): + # specify additional columns + - name: # required + - v: ... + + # optional + [latest_version](/reference/resource-properties/latest_version): +``` + + + +The standard convention for naming model versions is `<model_name>_v<v>`. This holds for the file where dbt expects to find the model's definition (SQL or Python), and the alias it will use by default when materializing the model in the database. + +### `v` + +The version identifier for a version of a model. This value can be numeric (integer or float), or any string. + +The value of the version identifier is used to order versions of a model relative to one another. If a versioned model does _not_ explicitly configure a [`latest_version`](/reference/resource-properties/latest_version), the highest version number is used as the latest version to resolve `ref` calls to the model without a `version` argument. + +In general, we recommend that you use a simple "major versioning" scheme for your models: `1`, `2`, `3`, and so on, where each version reflects a breaking change from previous versions. 
You are able to use other versioning schemes. dbt will sort your version identifiers alphabetically if the values are not all numeric. You should **not** include the letter `v` in the version identifier, as dbt will do that for you. + +### `defined_in` + +The name of the model file (excluding the file extension, e.g. `.sql` or `.py`) where the model version is defined. + +If `defined_in` is not specified, dbt searches for the definition of a versioned model in a model file named `<model_name>_v<v>`. The **latest** version of a model may also be defined in a file named `<model_name>`, without the version suffix. Model file names must be globally unique, even when defining versioned implementations of a model with a different name. + +### `alias` + +The default resolved `alias` for a versioned model is `<model_name>_v<v>`. The logic for this is encoded in the `generate_alias_name` macro. + +This default can be overwritten in two ways: +- Configuring a custom `alias` within the version YAML, or the versioned model's definition +- Overwriting dbt's `generate_alias_name` macro, to use different behavior based on `node.version` + +See ["Custom aliases"](https://docs.getdbt.com/docs/build/custom-aliases) for more details. + +Note that the value of `defined_in` and the `alias` configuration of a model are not coordinated, except by convention. The two are declared and determined independently. + +### Our recommendations +- Follow a consistent naming convention for model versions and aliases. +- Use `defined_in` and `alias` only if you have good reason. +- Create a view that always points to the latest version of your model. You can automate this for all versioned models in your project with an `on-run-end` hook. 
For more details, read the full docs on ["Model versions"](/docs/collaborate/govern/model-versions#configuring-database-location-with-alias) + +### Detecting breaking changes + +When you use the `state:modified` selection method in Slim CI, dbt will detect changes to versioned model contracts, and raise an error if any of those changes could be breaking for downstream consumers. + +Breaking changes include: +- Removing an existing column +- Changing the `data_type` of an existing column +- Removing or modifying one of the `constraints` on an existing column (dbt v1.6 or higher) +- Changing unversioned, contracted models. + - dbt also warns if a model has or had a contract but isn't versioned + + + + + +``` + Breaking Change to Unversioned Contract for contracted_model (models/contracted_models/contracted_model.sql) + While comparing to previous project state, dbt detected a breaking change to an unversioned model. + - Contract enforcement was removed: Previously, this model's configuration included contract: {enforced: true}. It is no longer configured to enforce its contract, and this is a breaking change. + - Columns were removed: + - color + - date_day + - Enforced column level constraints were removed: + - id (ConstraintType.not_null) + - id (ConstraintType.primary_key) + - Enforced model level constraints were removed: + - ConstraintType.check -> ['id'] + - Materialization changed with enforced constraints: + - table -> view +``` + + + + +``` +Breaking Change to Contract Error in model sometable (models/sometable.sql) + While comparing to previous project state, dbt detected a breaking change to an enforced contract. + + The contract's enforcement has been disabled. + + Columns were removed: + - order_name + + Columns with data_type changes: + - order_id (number -> int) + + Consider making an additive (non-breaking) change instead, if possible. 
+ Otherwise, create a new model version: https://docs.getdbt.com/docs/collaborate/govern/model-versions +``` + + + + + + +Additive changes are **not** considered breaking: +- Adding a new column to a contracted model +- Adding new `constraints` to an existing column in a contracted model diff --git a/website/docs/reference/seed-configs.md b/website/docs/reference/seed-configs.md index b7da013a133..429aa9444ae 100644 --- a/website/docs/reference/seed-configs.md +++ b/website/docs/reference/seed-configs.md @@ -1,12 +1,8 @@ --- title: Seed configurations +description: "Read this guide to learn about using seed configurations in dbt." --- - - - **v0.21.0** introduced the `config` property, thereby allowing you to configure seeds in all `.yml` files - - - ## Available configurations ### Seed-specific configurations @@ -24,9 +20,10 @@ title: Seed configurations ```yml seeds: - [](resource-path): - [+](plus-prefix)[quote_columns](resource-configs/quote_columns): true | false - [+](plus-prefix)[column_types](resource-configs/column_types): {column_name: datatype} + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[quote_columns](/reference/resource-configs/quote_columns): true | false + [+](/reference/resource-configs/plus-prefix)[column_types](/reference/resource-configs/column_types): {column_name: datatype} + [+](/reference/resource-configs/plus-prefix)[delimiter](/reference/resource-configs/delimiter): ``` @@ -45,8 +42,9 @@ version: 2 seeds: - name: [] config: - [quote_columns](resource-configs/quote_columns): true | false - [column_types](resource-configs/column_types): {column_name: datatype} + [quote_columns](/reference/resource-configs/quote_columns): true | false + [column_types](/reference/resource-configs/column_types): {column_name: datatype} + [delimiter](/reference/resource-configs/delimiter): ``` @@ -73,18 +71,18 @@ seeds: ```yaml seeds: - [](resource-path): - [+](plus-prefix)[enabled](enabled): true | false - 
[+](plus-prefix)[tags](resource-configs/tags): | [] - [+](plus-prefix)[pre-hook](pre-hook-post-hook): | [] - [+](plus-prefix)[post-hook](pre-hook-post-hook): | [] - [+](plus-prefix)[database](resource-configs/database): - [+](plus-prefix)[schema](resource-configs/schema): - [+](plus-prefix)[alias](resource-configs/alias): - [+](plus-prefix)[persist_docs](persist_docs): - [+](plus-prefix)[full_refresh](full_refresh): - [+](plus-prefix)[meta](meta): {} - [+](plus-prefix)[grants](grants): {} + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[enabled](/reference/resource-configs/enabled): true | false + [+](/reference/resource-configs/plus-prefix)[tags](/reference/resource-configs/tags): | [] + [+](/reference/resource-configs/plus-prefix)[pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [+](/reference/resource-configs/plus-prefix)[post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [+](/reference/resource-configs/plus-prefix)[database](/reference/resource-configs/database): + [+](/reference/resource-configs/plus-prefix)[schema](/reference/resource-properties/schema): + [+](/reference/resource-configs/plus-prefix)[alias](/reference/resource-configs/alias): + [+](/reference/resource-configs/plus-prefix)[persist_docs](/reference/resource-configs/persist_docs): + [+](/reference/resource-configs/plus-prefix)[full_refresh](/reference/resource-configs/full_refresh): + [+](/reference/resource-configs/plus-prefix)[meta](/reference/resource-configs/meta): {} + [+](/reference/resource-configs/plus-prefix)[grants](/reference/resource-configs/grants): {} ``` @@ -95,7 +93,7 @@ seeds: - + ```yaml version: 2 @@ -103,17 +101,17 @@ version: 2 seeds: - name: [] config: - [enabled](enabled): true | false - [tags](resource-configs/tags): | [] - [pre-hook](pre-hook-post-hook): | [] - [post-hook](pre-hook-post-hook): | [] - [database](resource-configs/database): - [schema](resource-configs/schema): - 
[alias](resource-configs/alias): - [persist_docs](persist_docs): - [full_refresh](full_refresh): - [meta](meta): {} - [grants](grants): {} + [enabled](/reference/resource-configs/enabled): true | false + [tags](/reference/resource-configs/tags): | [] + [pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [database](/reference/resource-configs/database): + [schema](/reference/resource-properties/schema): + [alias](/reference/resource-configs/alias): + [persist_docs](/reference/resource-configs/persist_docs): + [full_refresh](/reference/resource-configs/full_refresh): + [meta](/reference/resource-configs/meta): {} + [grants](/reference/resource-configs/grants): {} ``` @@ -124,7 +122,7 @@ seeds: ## Configuring seeds -Seeds can only be configured from yaml files, either in `dbt_project.yml` or within an individual seed's yaml properties. It is not possible to configure a seed from within its CSV file. +Seeds can only be configured from YAML files, either in `dbt_project.yml` or within an individual seed's YAML properties. It is not possible to configure a seed from within its CSV file. Seed configurations, like model configurations, are applied hierarchically — configurations applied to a `marketing` subdirectory will take precedence over configurations applied to the entire `jaffle_shop` project, and configurations defined in a specific seed's properties will override configurations defined in `dbt_project.yml`. @@ -144,7 +142,7 @@ seeds: #### Apply the `schema` configuration to all seeds in your project -To apply a configuration to all seeds in your project only (i.e. _excluding_ any seeds in installed packages), provide your [project name](project-configs/name.md) as part of the resource path. +To apply a configuration to all seeds in your project only (i.e. 
_excluding_ any seeds in installed packages), provide your [project name](/reference/project-configs/name.md) as part of the resource path. For a project named `jaffle_shop`: diff --git a/website/docs/reference/seed-properties.md b/website/docs/reference/seed-properties.md index 7fbed82dc27..85e7be21ae1 100644 --- a/website/docs/reference/seed-properties.md +++ b/website/docs/reference/seed-properties.md @@ -2,11 +2,6 @@ title: Seed properties --- - - - **v0.21.0** introduced the `config` property, thereby allowing you to configure seeds in all `.yml` files - - **v1.0.0:** The default path for [`seed-paths`](seed-paths) (formerly `data-paths`) is now `seeds`. - - Seed properties can be declared in `.yml` files under a `seed` key. We recommend that you put them in the `seeds/` directory. You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within that directory. @@ -18,21 +13,21 @@ version: 2 seeds: - name: - [description](description): + [description](/reference/resource-properties/description): [docs](/reference/resource-configs/docs): show: true | false - [config](resource-properties/config): - [](seed-configs): - [tests](resource-properties/tests): + [config](/reference/resource-properties/config): + [](/reference/seed-configs): + [tests](/reference/resource-properties/tests): - - ... # declare additional tests columns: - name: - [description](description): - [meta](meta): {} - [quote](quote): true | false - [tags](resource-configs/tags): [] - [tests](resource-properties/tests): + [description](/reference/resource-properties/description): + [meta](/reference/resource-configs/meta): {} + [quote](/reference/resource-properties/quote): true | false + [tags](/reference/resource-configs/tags): [] + [tests](/reference/resource-properties/tests): - - ... # declare additional tests @@ -41,9 +36,3 @@ seeds: - name: ... 
# declare properties of additional seeds ``` - - - -* `v0.16.0`: The ability to declare seed properties was introduced. Prior to this, you could declare seed properties under the `models:` key (confusing, right?). Support for declaring seed properties under a `models:` key will be removed in a future release. - - diff --git a/website/docs/reference/snapshot-configs.md b/website/docs/reference/snapshot-configs.md index 0f0c629658f..59f4e3c254e 100644 --- a/website/docs/reference/snapshot-configs.md +++ b/website/docs/reference/snapshot-configs.md @@ -1,9 +1,10 @@ --- title: Snapshot configurations +description: "Read this guide to learn about using snapshot configurations in dbt." --- ## Related documentation -* [Snapshots](snapshots) -* The `dbt snapshot` [command](snapshot) +* [Snapshots](/docs/build/snapshots) +* The `dbt snapshot` [command](/reference/commands/snapshot) @@ -74,12 +77,12 @@ snapshots: ```jinja {{ config( - [target_schema](target_schema)="", - [target_database](target_database)="", - [unique_key](unique_key)="", - [strategy](strategy)="timestamp" | "check", - [updated_at](updated_at)="", - [check_cols](check_cols)=[""] | "all" + [target_schema](/reference/resource-configs/target_schema)="", + [target_database](/reference/resource-configs/target_database)="", + [unique_key](/reference/resource-configs/unique_key)="", + [strategy](/reference/resource-configs/strategy)="timestamp" | "check", + [updated_at](/reference/resource-configs/updated_at)="", + [check_cols](/reference/resource-configs/check_cols)=[""] | "all" ) }} ``` @@ -107,13 +110,14 @@ snapshots: ```yaml snapshots: - [](resource-path): - [+](plus-prefix)[enabled](enabled): true | false - [+](plus-prefix)[tags](resource-configs/tags): | [] - [+](plus-prefix)[pre-hook](pre-hook-post-hook): | [] - [+](plus-prefix)[post-hook](pre-hook-post-hook): | [] - [+](plus-prefix)[persist_docs](persist_docs): {} - [+](plus-prefix)[grants](grants): {} + [](/reference/resource-configs/resource-path): + 
[+](/reference/resource-configs/plus-prefix)[enabled](/reference/resource-configs/enabled): true | false + [+](/reference/resource-configs/plus-prefix)[tags](/reference/resource-configs/tags): | [] + [+](/reference/resource-configs/plus-prefix)[alias](/reference/resource-configs/alias): + [+](/reference/resource-configs/plus-prefix)[pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [+](/reference/resource-configs/plus-prefix)[post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [+](/reference/resource-configs/plus-prefix)[persist_docs](/reference/resource-configs/persist_docs): {} + [+](/reference/resource-configs/plus-prefix)[grants](/reference/resource-configs/grants): {} ``` @@ -129,12 +133,13 @@ version: 2 snapshots: - name: [] config: - [enabled](enabled): true | false - [tags](resource-configs/tags): | [] - [pre-hook](pre-hook-post-hook): | [] - [post-hook](pre-hook-post-hook): | [] - [persist_docs](persist_docs): {} - [grants](grants): {} + [enabled](/reference/resource-configs/enabled): true | false + [tags](/reference/resource-configs/tags): | [] + [alias](/reference/resource-configs/alias): + [pre-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [post-hook](/reference/resource-configs/pre-hook-post-hook): | [] + [persist_docs](/reference/resource-configs/persist_docs): {} + [grants](/reference/resource-configs/grants): {} ``` @@ -147,12 +152,13 @@ snapshots: ```jinja {{ config( - [enabled](enabled)=true | false, - [tags](resource-configs/tags)="" | [""], - [pre_hook](pre-hook-post-hook)="" | [""], - [post_hook](pre-hook-post-hook)="" | [""] - [persist_docs](persist_docs)={} - [grants](grants)={} + [enabled](/reference/resource-configs/enabled)=true | false, + [tags](/reference/resource-configs/tags)="" | [""], + [alias](/reference/resource-configs/alias)="", + [pre_hook](/reference/resource-configs/pre-hook-post-hook)="" | [""], + [post_hook](/reference/resource-configs/pre-hook-post-hook)="" | [""], +
[persist_docs](/reference/resource-configs/persist_docs)={}, + [grants](/reference/resource-configs/grants)={} ) }} ``` @@ -166,7 +172,7 @@ snapshots: Snapshots can be configured in one of three ways: 1. Using a `config` block within a snapshot -2. Using a `config` [resource property](model-properties) in a `.yml` file +2. Using a `config` [resource property](/reference/model-properties) in a `.yml` file 3. From the `dbt_project.yml` file, under the `snapshots:` key. To apply a configuration to a snapshot, or directory of snapshots, define the resource path as nested dictionary keys. Snapshot configurations are applied hierarchically in the order above. diff --git a/website/docs/reference/snapshot-properties.md b/website/docs/reference/snapshot-properties.md index c4fa4cac218..301747e9325 100644 --- a/website/docs/reference/snapshot-properties.md +++ b/website/docs/reference/snapshot-properties.md @@ -1,14 +1,11 @@ --- title: Snapshot properties +description: "Read this guide to learn about using snapshot properties in dbt." --- - - - **v0.21.0** introduced the `config` property, thereby allowing you to configure snapshots in all `.yml` files - - Snapshots properties can be declared in `.yml` files in: -- your `snapshots/` directory (as defined by the [`snapshot-paths` config](snapshot-paths)) -- your `models/` directory (as defined by the [`model-paths` config](model-paths)) +- your `snapshots/` directory (as defined by the [`snapshot-paths` config](/reference/project-configs/snapshot-paths)) +- your `models/` directory (as defined by the [`model-paths` config](/reference/project-configs/model-paths)) We recommend that you put them in the `snapshots/` directory. You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `snapshots/` or `models/` directory.
@@ -19,22 +16,22 @@ version: 2 snapshots: - name: - [description](description): - [meta](meta): {} + [description](/reference/resource-properties/description): + [meta](/reference/resource-configs/meta): {} [docs](/reference/resource-configs/docs): show: true | false - [config](resource-properties/config): - [](snapshot-configs): - [tests](resource-properties/tests): + [config](/reference/resource-properties/config): + [](/reference/snapshot-configs): + [tests](/reference/resource-properties/tests): - - ... columns: - name: - [description](description): - [meta](meta): {} - [quote](quote): true | false - [tags](resource-configs/tags): [] - [tests](resource-properties/tests): + [description](/reference/resource-properties/description): + [meta](/reference/resource-configs/meta): {} + [quote](/reference/resource-properties/quote): true | false + [tags](/reference/resource-configs/tags): [] + [tests](/reference/resource-properties/tests): - - ... # declare additional tests - ... # declare properties of additional columns @@ -43,9 +40,3 @@ snapshots: ``` - - - -* `v0.16.0`: The ability to declare snapshot properties was introduced. 
- - diff --git a/website/docs/reference/snowflake-permissions.md b/website/docs/reference/snowflake-permissions.md index 80dbec25cc8..6a469d12230 100644 --- a/website/docs/reference/snowflake-permissions.md +++ b/website/docs/reference/snowflake-permissions.md @@ -15,9 +15,11 @@ grant usage on schema database.an_existing_schema to role role_name; grant create table on schema database.an_existing_schema to role role_name; grant create view on schema database.an_existing_schema to role role_name; grant usage on future schemas in database database_name to role role_name; +grant monitor on future schemas in database database_name to role role_name; grant select on future tables in database database_name to role role_name; grant select on future views in database database_name to role role_name; grant usage on all schemas in database database_name to role role_name; +grant monitor on all schemas in database database_name to role role_name; grant select on all tables in database database_name to role role_name; grant select on all views in database database_name to role role_name; ``` diff --git a/website/docs/reference/source-configs.md b/website/docs/reference/source-configs.md index 4968593c0f2..3f9a19e78ca 100644 --- a/website/docs/reference/source-configs.md +++ b/website/docs/reference/source-configs.md @@ -1,11 +1,12 @@ --- title: Source configurations +description: "Learn how to use source configurations in dbt." id: source-configs --- ## Available configurations -Sources only support one configuration, [`enabled`](enabled). +Sources only support one configuration, [`enabled`](/reference/resource-configs/enabled). ### General configurations @@ -24,8 +25,8 @@ Sources only support one configuration, [`enabled`](enabled). 
```yaml sources: - [](resource-path): - [+](plus-prefix)[enabled](enabled): true | false + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[enabled](/reference/resource-configs/enabled): true | false ``` @@ -36,8 +37,6 @@ sources: - - ```yaml @@ -45,43 +44,30 @@ version: 2 sources: - name: [] - [config](resource-properties/config): - [enabled](enabled): true | false + [config](/reference/resource-properties/config): + [enabled](/reference/resource-configs/enabled): true | false tables: - name: [] - [config](resource-properties/config): - [enabled](enabled): true | false + [config](/reference/resource-properties/config): + [enabled](/reference/resource-configs/enabled): true | false ``` - - ## Configuring sources - - Sources can be configured via a `config:` block within their `.yml` definitions, or from the `dbt_project.yml` file under the `sources:` key. This configuration is most useful for configuring sources imported from [a package](/docs/build/packages). You can disable sources imported from a package to prevent them from rendering in the documentation, or to prevent [source freshness checks](/docs/build/sources#snapshotting-source-data-freshness) from running on source tables imported from packages. - - - - -Sources can be configured from the `dbt_project.yml` file under the `sources:` key. This configuration is most useful for configuring sources imported from [a package](package-management). You can disable sources imported from a package to prevent them from rendering in the documentation, or to prevent [source freshness checks](/docs/build/sources#snapshotting-source-data-freshness) from running on source tables imported from packages. - -Unlike other resource types, sources do not yet support a `config` property. It is not possible to (re)define source configs hierarchically across multiple yaml files. 
- - ### Examples #### Disable all sources imported from a package To apply a configuration to all sources included from a [package](/docs/build/packages), -state your configuration under the [project name](project-configs/name.md) in the +state your configuration under the [project name](/reference/project-configs/name.md) in the `sources:` config as a part of the resource path. @@ -96,8 +82,6 @@ sources: - - #### Conditionally enable a single source When defining a source, you can disable the entire source, or specific source tables, using the inline `config` property: @@ -137,8 +121,6 @@ sources: - - #### Disable a single source from a package To disable a specific source from another package, qualify the resource path for your configuration with both a package name and a source name. In this case, we're disabling the `clickstream` source from the `events` package. diff --git a/website/docs/reference/source-properties.md b/website/docs/reference/source-properties.md index e4e9709ddc1..d20ef5f2877 100644 --- a/website/docs/reference/source-properties.md +++ b/website/docs/reference/source-properties.md @@ -1,13 +1,14 @@ --- -title: Source properties +title: "About source properties" +description: "Learn how to use source properties in dbt." --- ## Related documentation - [Using sources](/docs/build/sources) -- [Declaring resource properties](configs-and-properties) +- [Declaring resource properties](/reference/configs-and-properties) ## Overview -Source properties can be declared in `.yml` files in your `models/` directory (as defined by the [`model-paths` config](model-paths)). +Source properties can be declared in `.yml` files in your `models/` directory (as defined by the [`model-paths` config](/reference/project-configs/model-paths)). You can name these files `whatever_you_want.yml`, and nest them arbitrarily deeply in subfolders within the `models/` directory. 
@@ -18,67 +19,67 @@ version: 2 sources: - name: # required - [description](description): - [database](resource-properties/database): - [schema](resource-properties/schema): - [loader](loader): - [loaded_at_field](resource-properties/freshness#loaded_at_field): - [meta](meta): {} - [tags](resource-configs/tags): [] + [description](/reference/resource-properties/description): + [database](/reference/resource-properties/database): + [schema](/reference/resource-properties/schema): + [loader](/reference/resource-properties/loader): + [loaded_at_field](/reference/resource-properties/freshness#loaded_at_field): + [meta](/reference/resource-configs/meta): {} + [tags](/reference/resource-configs/tags): [] # requires v1.1+ - [config](resource-properties/config): + [config](/reference/resource-properties/config): [](source-configs): - [overrides](resource-properties/overrides): + [overrides](/reference/resource-properties/overrides): - [freshness](resource-properties/freshness): + [freshness](/reference/resource-properties/freshness): warn_after: - [count](resource-properties/freshness#count): - [period](resource-properties/freshness#period): minute | hour | day + [count](/reference/resource-properties/freshness#count): + [period](/reference/resource-properties/freshness#period): minute | hour | day error_after: - [count](resource-properties/freshness#count): - [period](resource-properties/freshness#period): minute | hour | day - [filter](resource-properties/freshness#filter): + [count](/reference/resource-properties/freshness#count): + [period](/reference/resource-properties/freshness#period): minute | hour | day + [filter](/reference/resource-properties/freshness#filter): - [quoting](resource-properties/quoting): + [quoting](/reference/resource-properties/quoting): database: true | false schema: true | false identifier: true | false tables: - name: #required - [description](description): - [meta](meta): {} - [identifier](identifier): - 
[loaded_at_field](resource-properties/freshness#loaded_at_field): - [tests](resource-properties/tests): + [description](/reference/resource-properties/description): + [meta](/reference/resource-configs/meta): {} + [identifier](/reference/resource-properties/identifier): + [loaded_at_field](/reference/resource-properties/freshness#loaded_at_field): + [tests](/reference/resource-properties/tests): - - ... # declare additional tests - [tags](resource-configs/tags): [] - [freshness](resource-properties/freshness): + [tags](/reference/resource-configs/tags): [] + [freshness](/reference/resource-properties/freshness): warn_after: - [count](resource-properties/freshness#count): - [period](resource-properties/freshness#period): minute | hour | day + [count](/reference/resource-properties/freshness#count): + [period](/reference/resource-properties/freshness#period): minute | hour | day error_after: - [count](resource-properties/freshness#count): - [period](resource-properties/freshness#period): minute | hour | day - [filter](resource-properties/freshness#filter): + [count](/reference/resource-properties/freshness#count): + [period](/reference/resource-properties/freshness#period): minute | hour | day + [filter](/reference/resource-properties/freshness#filter): - [quoting](resource-properties/quoting): + [quoting](/reference/resource-properties/quoting): database: true | false schema: true | false identifier: true | false - [external](resource-properties/external): {} + [external](/reference/resource-properties/external): {} columns: - name: # required - [description](description): - [meta](meta): {} - [quote](quote): true | false - [tests](resource-properties/tests): + [description](/reference/resource-properties/description): + [meta](/reference/resource-configs/meta): {} + [quote](/reference/resource-properties/quote): true | false + [tests](/reference/resource-properties/tests): - - ... 
# declare additional tests - [tags](resource-configs/tags): [] + [tags](/reference/resource-configs/tags): [] - name: ... # declare properties of additional columns - name: ... # declare properties of additional source tables diff --git a/website/docs/reference/test-configs.md b/website/docs/reference/test-configs.md index 88255e62189..52f814b6ef1 100644 --- a/website/docs/reference/test-configs.md +++ b/website/docs/reference/test-configs.md @@ -1,25 +1,18 @@ --- title: Test configurations +description: "Read this guide to learn about using test configurations in dbt." --- ## Related documentation * [Tests](/docs/build/tests) - - -* `v0.20.0`: Introduced the ability to configure tests from `dbt_project.yml`, and to configure `enabled` for generic tests. Introduced `fail_calc`, `where`, `error_if`, `warn_if`, `store_failures`, and `where` configs. -* `v0.21.0`: Introduced the `config()` dictionary, making it easier and clearer to configure specific instances of generic tests - - - - Tests can be configured in a few different ways: -1. Properties within `.yml` definition (generic tests only, see [test properties](resource-properties/tests) for full syntax) +1. Properties within `.yml` definition (generic tests only, see [test properties](/reference/resource-properties/tests) for full syntax) 2. A `config()` block within the test's SQL definition 3. In `dbt_project.yml` -Test configs are applied hierarchically, in the order of specifity outlined above. In the case of a singular test, the `config()` block within the SQL definition takes precedence over configs in the project file. In the case of a specific instance of a generic test, the test's `.yml` properties would take precedence over any values set in its generic SQL definition's `config()`, which in turn would take precedence over values set in `dbt_project.yml`. +Test configs are applied hierarchically, in the order of specificity outlined above. 
In the case of a singular test, the `config()` block within the SQL definition takes precedence over configs in the project file. In the case of a specific instance of a generic test, the test's `.yml` properties would take precedence over any values set in its generic SQL definition's `config()`, which in turn would take precedence over values set in `dbt_project.yml`. ## Available configurations @@ -42,14 +35,14 @@ Click the link on each configuration option to read more about what it can do. ```yaml tests: - [](resource-path): - [+](plus-prefix)[fail_calc](fail_calc): - [+](plus-prefix)[limit](limit): - [+](plus-prefix)[severity](severity): error | warn - [+](plus-prefix)[error_if](severity): - [+](plus-prefix)[warn_if](severity): - [+](plus-prefix)[store_failures](store_failures): true | false - [+](plus-prefix)[where](where): + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[fail_calc](/reference/resource-configs/fail_calc): + [+](/reference/resource-configs/plus-prefix)[limit](/reference/resource-configs/limit): + [+](/reference/resource-configs/plus-prefix)[severity](/reference/resource-configs/severity): error | warn + [+](/reference/resource-configs/plus-prefix)[error_if](/reference/resource-configs/severity): + [+](/reference/resource-configs/plus-prefix)[warn_if](/reference/resource-configs/severity): + [+](/reference/resource-configs/plus-prefix)[store_failures](/reference/resource-configs/store_failures): true | false + [+](/reference/resource-configs/plus-prefix)[where](/reference/resource-configs/where): ``` @@ -63,13 +56,13 @@ tests: ```jinja {{ config( - [fail_calc](fail_calc) = "", - [limit](limit) = , - [severity](severity) = "error | warn", - [error_if](severity) = "", - [warn_if](severity) = "", - [store_failures](store_failures) = true | false, - [where](where) = "" + [fail_calc](/reference/resource-configs/fail_calc) = "", + [limit](/reference/resource-configs/limit) = , + 
[severity](/reference/resource-configs/severity) = "error | warn", + [error_if](/reference/resource-configs/severity) = "", + [warn_if](/reference/resource-configs/severity) = "", + [store_failures](/reference/resource-configs/store_failures) = true | false, + [where](/reference/resource-configs/where) = "" ) }} ``` @@ -87,28 +80,28 @@ version: 2 tests: - [](#test_name): : - [config](resource-properties/config): - [fail_calc](fail_calc): - [limit](limit): - [severity](severity): error | warn - [error_if](severity): - [warn_if](severity): - [store_failures](store_failures): true | false - [where](where): - - [columns](columns): + [config](/reference/resource-properties/config): + [fail_calc](/reference/resource-configs/fail_calc): + [limit](/reference/resource-configs/limit): + [severity](/reference/resource-configs/severity): error | warn + [error_if](/reference/resource-configs/severity): + [warn_if](/reference/resource-configs/severity): + [store_failures](/reference/resource-configs/store_failures): true | false + [where](/reference/resource-configs/where): + + [columns](/reference/resource-properties/columns): - name: tests: - [](#test_name): : - [config](resource-properties/config): - [fail_calc](fail_calc): - [limit](limit): - [severity](severity): error | warn - [error_if](severity): - [warn_if](severity): - [store_failures](store_failures): true | false - [where](where): + [config](/reference/resource-properties/config): + [fail_calc](/reference/resource-configs/fail_calc): + [limit](/reference/resource-configs/limit): + [severity](/reference/resource-configs/severity): error | warn + [error_if](/reference/resource-configs/severity): + [warn_if](/reference/resource-configs/severity): + [store_failures](/reference/resource-configs/store_failures): true | false + [where](/reference/resource-configs/where): ``` This configuration mechanism is supported for specific instances of generic tests only. 
To configure a specific singular test, you should use the `config()` macro in its SQL definition. @@ -137,14 +130,14 @@ This configuration mechanism is supported for specific instances of generic test ```yaml tests: - [](resource-path): - [+](plus-prefix)[enabled](enabled): true | false - [+](plus-prefix)[tags](resource-configs/tags): | [] - [+](plus-prefix)[meta](resource-configs/meta): {dictionary} - # relevant for [store_failures](resource-configs/store_failures) only - [+](plus-prefix)[database](resource-configs/database): - [+](plus-prefix)[schema](resource-configs/schema): - [+](plus-prefix)[alias](resource-configs/alias): + [](/reference/resource-configs/resource-path): + [+](/reference/resource-configs/plus-prefix)[enabled](/reference/resource-configs/enabled): true | false + [+](/reference/resource-configs/plus-prefix)[tags](/reference/resource-configs/tags): | [] + [+](/reference/resource-configs/plus-prefix)[meta](/reference/resource-configs/meta): {dictionary} + # relevant for [store_failures](/reference/resource-configs/store_failures) only + [+](/reference/resource-configs/plus-prefix)[database](/reference/resource-configs/database): + [+](/reference/resource-configs/plus-prefix)[schema](/reference/resource-properties/schema): + [+](/reference/resource-configs/plus-prefix)[alias](/reference/resource-configs/alias): ``` @@ -156,12 +149,12 @@ tests: ```jinja {{ config( - [enabled](enabled)=true | false, - [tags](resource-configs/tags)="" | [""] - [meta](resource-configs/meta)={dictionary}, - [database](resource-configs/database)="", - [schema](resource-configs/schema)="", - [alias](resource-configs/alias)="", + [enabled](/reference/resource-configs/enabled)=true | false, + [tags](/reference/resource-configs/tags)="" | [""], + [meta](/reference/resource-configs/meta)={dictionary}, + [database](/reference/resource-configs/database)="", + [schema](/reference/resource-properties/schema)="", + [alias](/reference/resource-configs/alias)="", ) }} ``` @@ -178,28
+171,28 @@ version: 2 tests: - [](#test_name): : - [config](resource-properties/config): - [enabled](enabled): true | false - [tags](resource-configs/tags): | [] - [meta](resource-configs/meta): {dictionary} - # relevant for [store_failures](resource-configs/store_failures) only - [database](resource-configs/database): - [schema](resource-configs/schema): - [alias](resource-configs/alias): - - [columns](columns): + [config](/reference/resource-properties/config): + [enabled](/reference/resource-configs/enabled): true | false + [tags](/reference/resource-configs/tags): | [] + [meta](/reference/resource-configs/meta): {dictionary} + # relevant for [store_failures](/reference/resource-configs/store_failures) only + [database](/reference/resource-configs/database): + [schema](/reference/resource-properties/schema): + [alias](/reference/resource-configs/alias): + + [columns](/reference/resource-properties/columns): - name: tests: - [](#test_name): : - [config](resource-properties/config): - [enabled](enabled): true | false - [tags](resource-configs/tags): | [] - [meta](resource-configs/meta): {dictionary} - # relevant for [store_failures](resource-configs/store_failures) only - [database](resource-configs/database): - [schema](resource-configs/schema): - [alias](resource-configs/alias): + [config](/reference/resource-properties/config): + [enabled](/reference/resource-configs/enabled): true | false + [tags](/reference/resource-configs/tags): | [] + [meta](/reference/resource-configs/meta): {dictionary} + # relevant for [store_failures](/reference/resource-configs/store_failures) only + [database](/reference/resource-configs/database): + [schema](/reference/resource-properties/schema): + [alias](/reference/resource-configs/alias): ``` This configuration mechanism is supported for specific instances of generic tests only. To configure a specific singular test, you should use the `config()` macro in its SQL definition. 
diff --git a/website/docs/reference/warehouse-setups/databricks-setup.md b/website/docs/reference/warehouse-setups/databricks-setup.md deleted file mode 100644 index 86439aa2484..00000000000 --- a/website/docs/reference/warehouse-setups/databricks-setup.md +++ /dev/null @@ -1,98 +0,0 @@ ---- -title: "Databricks setup" -id: "databricks-setup" -meta: - maintained_by: Databricks - authors: 'some dbt loving Bricksters' - github_repo: 'databricks/dbt-databricks' - pypi_package: 'dbt-databricks' - min_core_version: 'v0.18.0' - cloud_support: Coming Soon - min_supported_version: 'n/a' - slack_channel_name: '#db-databricks-and-spark' - slack_channel_link: 'https://getdbt.slack.com/archives/CNGCW8HKL' - platform_name: 'Databricks' - config_page: 'spark-configs' ---- - -

          Overview of {frontMatter.meta.pypi_package}

          - -
            -
          • Maintained by: {frontMatter.meta.maintained_by}
          • -
          • Authors: {frontMatter.meta.authors}
          • -
          • GitHub repo: {frontMatter.meta.github_repo}
          • -
          • PyPI package: {frontMatter.meta.pypi_package}
          • -
          • Slack channel: {frontMatter.meta.slack_channel_name}
          • -
          • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
          • -
          • dbt Cloud support: {frontMatter.meta.cloud_support}
          • -
          • Minimum data platform version: {frontMatter.meta.min_supported_version}
          • -
          - -## Installation and Distribution - - -

          Installing {frontMatter.meta.pypi_package}

          - -pip is the easiest way to install the adapter: - -pip install {frontMatter.meta.pypi_package} - -

          Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

          - -

          Configuring {frontMatter.meta.pypi_package}

          - -

          For {frontMatter.meta.platform_name}-specifc configuration please refer to {frontMatter.meta.platform_name} Configuration

          - -

          For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

          - - -### Set up a Databricks Target - -dbt-databricks can connect to Databricks all-purpose clusters as well as SQL endpoints. -The latter provides an opinionated way of running SQL workloads with optimal performance and price; the former provides all the flexibility of Spark. - - - -```yaml -your_profile_name: - target: dev - outputs: - dev: - type: databricks - catalog: [optional catalog name, if you are using Unity Catalog, is only available in dbt-databricks>=1.1.1] - schema: [schema name] - host: [yourorg.databrickshost.com] - http_path: [/sql/your/http/path] - token: [dapiXXXXXXXXXXXXXXXXXXXXXXX] # Personal Access Token (PAT) - threads: [1 or more] # optional, default 1 -``` - - - -See the [Databricks documentation](https://docs.databricks.com/dev-tools/dbt.html#) on how -to obtain the credentials for configuring your profile. - -## Caveats - -### Supported Functionality - -Most dbt Core functionality is supported, but some features are only available -on Delta Lake. - -Delta-only features: -1. Incremental model updates by `unique_key` instead of `partition_by` (see [`merge` strategy](spark-configs#the-merge-strategy)) -2. [Snapshots](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots) - -### Choosing between dbt-databricks and dbt-spark - -While `dbt-spark` can be used to connect to Databricks, `dbt-databricks` was created to make it -even easier to use dbt with the Databricks Lakehouse. - -`dbt-databricks` includes: -- No need to install additional drivers or dependencies for use on the CLI -- Use of Delta Lake for all models out of the box -- SQL macros that are optimized to run with [Photon](https://docs.databricks.com/runtime/photon.html) - -### Support for Unity Catalog - -The adapter `dbt-databricks>=1.1.1` supports the 3-level namespace of Unity Catalog (catalog / schema / relations) so you can organize and secure your data the way you like. 
diff --git a/website/docs/reference/warehouse-setups/glue-setup.md b/website/docs/reference/warehouse-setups/glue-setup.md deleted file mode 100644 index 93af196cf01..00000000000 --- a/website/docs/reference/warehouse-setups/glue-setup.md +++ /dev/null @@ -1,273 +0,0 @@ ---- -title: "AWS Glue Setup" -id: "glue-setup" -meta: - maintained_by: Community - authors: 'Benjamin Menuet, Moshir Mikael, Armando Segnini and Amine El Mallem' - github_repo: 'aws-samples/dbt-glue' - pypi_package: 'dbt-glue' - min_core_version: 'v0.24.0' - cloud_support: Not Supported - min_supported_version: 'Glue 2.0' - slack_channel_name: '#db-glue' - slack_channel_link: 'https://getdbt.slack.com/archives/C02R4HSMBAT' - platform_name: 'AWS Glue' - config_page: 'glue-configs' ---- - -:::info Community plugin - -Some core functionality may be limited. If you're interested in contributing, check out the source code for each repository listed below. - -::: - -

          Overview of {frontMatter.meta.pypi_package}

          - -
            -
          • Maintained by: {frontMatter.meta.maintained_by}
          • -
          • Authors: {frontMatter.meta.authors}
          • -
          • GitHub repo: {frontMatter.meta.github_repo}
          • -
          • PyPI package: {frontMatter.meta.pypi_package}
          • -
          • Slack channel: {frontMatter.meta.slack_channel_name}
          • -
          • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
          • -
          • dbt Cloud support: {frontMatter.meta.cloud_support}
          • -
          • Minimum data platform version: {frontMatter.meta.min_supported_version}
          • -
          - - -

          Installing {frontMatter.meta.pypi_package}

          - -pip is the easiest way to install the adapter: - -pip install {frontMatter.meta.pypi_package} - -

          Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

          - -

          Configuring {frontMatter.meta.pypi_package}

          - -

          For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

          - -

          For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

          - -For further (and more likely up-to-date) info, see the [README](https://github.com/aws-samples/dbt-glue#readme) - - -## Connection Methods - - -### Configuring your AWS profile for Glue Interactive Session -There are two IAM principals used with interactive sessions. -- Client principal: The princpal (either user or role) calling the AWS APIs (Glue, Lake Formation, Interactive Sessions) -from the local client. This is the principal configured in the AWS CLI and likely the same. -- Service role: The IAM role that AWS Glue uses to execute your session. This is the same as AWS Glue -ETL. - -Read [this documentation](https://docs.aws.amazon.com/glue/latest/dg/glue-is-security.html) to configure these principals. - - -You will find bellow a least privileged policy to enjoy all features of **`dbt-glue`** adapter. - -Please to update variables between **`<>`**, here are explanations of these arguments: - -|Args |Description | -|---|---| -|region|The region where your Glue database is stored | -|AWS Account|The AWS account where you run your pipeline| -|dbt output database|The database updated by dbt (this is the database configured in the profile.yml of your dbt environment)| -|dbt source database|All databases used as source| -|dbt output bucket|The bucket name where the data will be generated by dbt (the location configured in the profile.yml of your dbt environment)| -|dbt source bucket|The bucket name of source databases (if they are not managed by Lake Formation)| - - - -```yml -{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "Read_and_write_databases", - "Action": [ - "glue:SearchTables", - "glue:BatchCreatePartition", - "glue:CreatePartitionIndex", - "glue:DeleteDatabase", - "glue:GetTableVersions", - "glue:GetPartitions", - "glue:DeleteTableVersion", - "glue:UpdateTable", - "glue:DeleteTable", - "glue:DeletePartitionIndex", - "glue:GetTableVersion", - "glue:UpdateColumnStatisticsForTable", - "glue:CreatePartition", - "glue:UpdateDatabase", 
- "glue:CreateTable", - "glue:GetTables", - "glue:GetDatabases", - "glue:GetTable", - "glue:GetDatabase", - "glue:GetPartition", - "glue:UpdateColumnStatisticsForPartition", - "glue:CreateDatabase", - "glue:BatchDeleteTableVersion", - "glue:BatchDeleteTable", - "glue:DeletePartition", - "lakeformation:ListResources", - "lakeformation:BatchGrantPermissions", - "lakeformation:ListPermissions" - ], - "Resource": [ - "arn:aws:glue:::catalog", - "arn:aws:glue:::table//*", - "arn:aws:glue:::database/" - ], - "Effect": "Allow" - }, - { - "Sid": "Read_only_databases", - "Action": [ - "glue:SearchTables", - "glue:GetTableVersions", - "glue:GetPartitions", - "glue:GetTableVersion", - "glue:GetTables", - "glue:GetDatabases", - "glue:GetTable", - "glue:GetDatabase", - "glue:GetPartition", - "lakeformation:ListResources", - "lakeformation:ListPermissions" - ], - "Resource": [ - "arn:aws:glue:::table//*", - "arn:aws:glue:::database/", - "arn:aws:glue:::database/default", - "arn:aws:glue:::database/global_temp" - ], - "Effect": "Allow" - }, - { - "Sid": "Storage_all_buckets", - "Action": [ - "s3:GetBucketLocation", - "s3:ListBucket" - ], - "Resource": [ - "arn:aws:s3:::", - "arn:aws:s3:::" - ], - "Effect": "Allow" - }, - { - "Sid": "Read_and_write_buckets", - "Action": [ - "s3:PutObject", - "s3:PutObjectAcl", - "s3:GetObject", - "s3:DeleteObject" - ], - "Resource": [ - "arn:aws:s3:::" - ], - "Effect": "Allow" - }, - { - "Sid": "Read_only_buckets", - "Action": [ - "s3:GetObject" - ], - "Resource": [ - "arn:aws:s3:::" - ], - "Effect": "Allow" - } - ] -} -``` - - -### Configuration of the local environment - -Because **`dbt`** and **`dbt-glue`** adapter are compatible with Python versions 3.7, 3.8, and 3.9, check the version of Python: - -```bash -$ python3 --version -``` - -Configure a Python virtual environment to isolate package version and code dependencies: - -```bash -$ sudo yum install git -$ python3 -m venv dbt_venv -$ source dbt_venv/bin/activate -$ python3 -m pip install 
--upgrade pip -``` - -Configure the latest version of AWS CLI - -```bash -$ curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" -$ unzip awscliv2.zip -$ sudo ./aws/install -``` - -Configure the aws-glue-sessions package - -```bash -$ sudo yum install gcc krb5-devel.x86_64 python3-devel.x86_64 -y -$ pip3 install --upgrade boto3 -$ pip3 install --upgrade aws-glue-sessions -``` - -### Example config - - -```yml -type: glue -query-comment: This is a glue dbt example -role_arn: arn:aws:iam::1234567890:role/GlueInteractiveSessionRole -region: us-east-1 -workers: 2 -worker_type: G.1X -idle_timeout: 10 -schema: "dbt_demo" -database: "dbt_demo" -session_provisioning_timeout_in_seconds: 120 -location: "s3://dbt_demo_bucket/dbt_demo_data" -``` - - - -The table below describes all the options. - -|Option |Description | Mandatory | -|---|---|---| -|project_name |The dbt project name. This must be the same as the one configured in the dbt project. |yes| -|type |The driver to use. |yes| -|query-comment |A string to inject as a comment in each query that dbt runs. |no| -|role_arn |The ARN of the interactive session role created as part of the CloudFormation template. |yes| -|region |The AWS Region where you run the data pipeline. |yes| -|workers |The number of workers of a defined workerType that are allocated when a job runs. |yes| -|worker_type |The type of predefined worker that is allocated when a job runs. Accepts a value of Standard, G.1X, or G.2X. |yes| -|schema |The schema used to organize data stored in Amazon S3. |yes| -|database |The database in Lake Formation. The database stores metadata tables in the Data Catalog. |yes| -|session_provisioning_timeout_in_seconds |The timeout in seconds for AWS Glue interactive session provisioning. |yes| -|location |The Amazon S3 location of your target data. |yes| -|idle_timeout |The AWS Glue session idle timeout in minutes. (The session stops after being idle for the specified amount of time.) 
|no| -|glue_version |The version of AWS Glue for this session to use. Currently, the only valid options are 2.0 and 3.0. The default value is 2.0. |no| -|security_configuration |The security configuration to use with this session. |no| -|connections |A comma-separated list of connections to use in the session. |no| - - -## Caveats - -### Supported Functionality - -Most dbt Core functionality is supported, but some features are only available with Apache Hudi. - -Apache Hudi-only features: -1. Incremental model updates by `unique_key` instead of `partition_by` (see [`merge` strategy](glue-configs#the-merge-strategy)) - -Some dbt features, available on the core adapters, are not yet supported on Glue: -1. [Persisting](persist_docs) column-level descriptions as database comments -2. [Snapshots](snapshots) diff --git a/website/docs/reference/warehouse-setups/redshift-setup.md b/website/docs/reference/warehouse-setups/redshift-setup.md deleted file mode 100644 index 28276594ba8..00000000000 --- a/website/docs/reference/warehouse-setups/redshift-setup.md +++ /dev/null @@ -1,145 +0,0 @@ ---- -title: "Redshift setup" -id: "redshift-setup" -meta: - maintained_by: dbt Labs - authors: 'core dbt maintainers' - github_repo: 'dbt-labs/dbt-redshift' - pypi_package: 'dbt-redshift' - min_core_version: 'v0.10.0' - cloud_support: Supported - min_supported_version: 'n/a' - slack_channel_name: '#db-redshift' - slack_channel_link: 'https://getdbt.slack.com/archives/C01DRQ178LQ' - platform_name: 'Redshift' - config_page: 'redshift-configs' ---- - -

          Overview of {frontMatter.meta.pypi_package}

          - -
            -
          • Maintained by: {frontMatter.meta.maintained_by}
          • -
          • Authors: {frontMatter.meta.authors}
          • -
          • GitHub repo: {frontMatter.meta.github_repo}
          • -
          • PyPI package: {frontMatter.meta.pypi_package}
          • -
          • Slack channel: {frontMatter.meta.slack_channel_name}
          • -
          • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
          • -
          • dbt Cloud support: {frontMatter.meta.cloud_support}
          • -
          • Minimum data platform version: {frontMatter.meta.min_supported_version}
          • -
          - - -

          Installing {frontMatter.meta.pypi_package}

          - -pip is the easiest way to install the adapter: - -pip install {frontMatter.meta.pypi_package} - -

          Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

          - -

          Configuring {frontMatter.meta.pypi_package}

          - -

          For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

          - -

          For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

          - - -## Authentication Methods - -### Password-based authentication - - - -```yaml -company-name: - target: dev - outputs: - dev: - type: redshift - host: hostname.region.redshift.amazonaws.com - user: username - password: password1 - port: 5439 - dbname: analytics - schema: analytics - threads: 4 - keepalives_idle: 240 # default 240 seconds - connect_timeout: 10 # default 10 seconds - # search_path: public # optional, not recommended - sslmode: [optional, set the sslmode used to connect to the database (in case this parameter is set, will look for ca in ~/.postgresql/root.crt)] - ra3_node: true # enables cross-database sources -``` - - - -### IAM Authentication - -To set up a Redshift profile using IAM Authentication, set the `method` -parameter to `iam` as shown below. Note that a password is not required when -using IAM Authentication. For more information on this type of authentication, -consult the [Redshift Documentation](https://docs.aws.amazon.com/redshift/latest/mgmt/generating-user-credentials.html) -and [boto3 -docs](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/redshift.html#Redshift.Client.get_cluster_credentials) -on generating user credentials with IAM Auth. - -If you receive the "You must specify a region" error when using IAM -Authentication, then your aws credentials are likely misconfigured. Try running -`aws configure` to set up AWS access keys, and pick a default region. If you have any questions, -please refer to the official AWS documentation on [Configuration and credential file settings](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-files.html). 
- - - -```yaml -my-redshift-db: - target: dev - outputs: - dev: - type: redshift - method: iam - cluster_id: CLUSTER_ID - host: hostname.region.redshift.amazonaws.com - user: alice - iam_profile: data_engineer # optional - iam_duration_seconds: 900 # optional - autocreate: true # optional - db_groups: ['ANALYSTS'] # optional - - # Other Redshift configs: - port: 5439 - dbname: analytics - schema: analytics - threads: 4 - [keepalives_idle](#keepalives_idle): 240 # default 240 seconds - connect_timeout: 10 # default 10 seconds - [retries](#retries): 1 # default 1 retry on error/timeout when opening connections - # search_path: public # optional, but not recommended - sslmode: [optional, set the sslmode used to connect to the database (in case this parameter is set, will look for ca in ~/.postgresql/root.crt)] - ra3_node: true # enables cross-database sources - -``` - - - -### Specifying an IAM Profile - -:::info New in dbt v0.18.0 -The `iam_profile` config option for Redshift profiles is new in dbt v0.18.0 -::: - -When the `iam_profile` configuration is set, dbt will use the specified profile from your `~/.aws/config` file instead of using the profile name `default` -## Redshift notes -### `sort` and `dist` keys -Where possible, dbt enables the use of `sort` and `dist` keys. See the section on [Redshift specific configurations](redshift-configs). - -### `keepalives_idle` -If the database closes its connection while dbt is waiting for data, you may see the error `SSL SYSCALL error: EOF detected`. Lowering the [`keepalives_idle` value](https://www.postgresql.org/docs/9.3/libpq-connect.html) may prevent this, because the server will send a ping to keep the connection active more frequently. - -[dbt's default setting](https://github.com/dbt-labs/dbt-redshift/blob/main/dbt/adapters/redshift/connections.py#L51) is 240 (seconds), but can be configured lower (perhaps 120 or 60), at the cost of a chattier network connection. 
- - - -#### retries - -If `dbt-redshift` encounters an operational error or timeout when opening a new connection, it will retry up to the number of times configured by `retries`. If set to 2+ retries, dbt will wait 1 second before retrying. The default value is 1 retry. If set to 0, dbt will not retry at all. - - diff --git a/website/docs/reference/warehouse-setups/trino-setup.md b/website/docs/reference/warehouse-setups/trino-setup.md deleted file mode 100644 index 7939f4d5d7d..00000000000 --- a/website/docs/reference/warehouse-setups/trino-setup.md +++ /dev/null @@ -1,119 +0,0 @@ ---- -title: "Starburst & Trino setup" -id: "trino-setup" -meta: - maintained_by: Starburst Data, Inc. - authors: Matthew Carter, Andy Regan, Andrew Hedengren - github_repo: 'starburstdata/dbt-trino' - pypi_package: 'dbt-trino' - min_core_version: 'v0.20.0' - cloud_support: Not Supported - min_supported_version: 'n/a' - slack_channel_name: '#db-clickhouse' - slack_channel_link: 'https://getdbt.slack.com/archives/C01DRQ178LQ' - platform_name: 'Trino' - config_page: 'no-configs' ---- - -:::info Vendor-supported plugin - -Certain core functionality may vary. If you would like to report a bug, request a feature, or contribute, you can check out the linked repository and open an issue. - -::: - -

          Overview of {frontMatter.meta.pypi_package}

          - -
            -
          • Maintained by: {frontMatter.meta.maintained_by}
          • -
          • Authors: {frontMatter.meta.authors}
          • -
          • GitHub repo: {frontMatter.meta.github_repo}
          • -
          • PyPI package: {frontMatter.meta.pypi_package}
          • -
          • Slack channel: {frontMatter.meta.slack_channel_name}
          • -
          • Supported dbt Core version: {frontMatter.meta.min_core_version} and newer
          • -
          • dbt Cloud support: {frontMatter.meta.cloud_support}
          • -
          • Minimum data platform version: {frontMatter.meta.min_supported_version}
          • -
          - - -

          Installing {frontMatter.meta.pypi_package}

          - -pip is the easiest way to install the adapter: - -pip install {frontMatter.meta.pypi_package} - -

          Installing {frontMatter.meta.pypi_package} will also install dbt-core and any other dependencies.

          - -

          Configuring {frontMatter.meta.pypi_package}

          - -

          For {frontMatter.meta.platform_name}-specific configuration please refer to {frontMatter.meta.platform_name} Configuration

          - -

          For further info, refer to the GitHub repository: {frontMatter.meta.github_repo}

          - - - -## Set up a Trino or Starburst Target - -Trino or Starburst targets should be set up using the following configuration in your `profiles.yml` file. - -See all possible profile configuration options [here](#configuration). - - - -```yaml -trino: - target: dev - outputs: - dev: - type: trino - method: none # optional, one of {none | ldap | kerberos | oauth | jwt | certificate} - user: [user] - password: [password] # required if method is ldap or kerberos - database: [database name] - host: [hostname] - port: [port number] - schema: [your dbt schema] - threads: [1 or more] - retries: [1 or more] # default: 3 - http_scheme: [http or https] - session_properties: - [some_session_property]: [value] # run SHOW SESSION query to get current session properties -``` - - - -## Incremental models - -Incremental strategies supported by the adapter are: - -- append (default incremental strategy) - append only adds the new records based on the condition specified in the is_incremental() conditional block. -- delete+insert - Through the delete+insert incremental strategy, you can instruct dbt to use a two-step incremental approach. It will first delete the records detected through the configured is_incremental() block and re-insert them. -- merge - Through the merge incremental strategy, dbt-trino constructs a MERGE statement which inserts new and updates existing records based on the unique key (specified by unique_key). -If your unique_key is not actually unique, the delete+insert strategy can be used instead. Note that some connectors in Trino have limited or no support for MERGE. - -## Configuration - -A dbt-trino profile can be configured to run against Trino or Starburst using the following configuration: - -| Option | Description | Required? 
| Example | -|--------------------------------|--------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------|----------------------------------| -| method | The Trino authentication method to use | Optional (default is `none`, supported methods are `ldap`, `kerberos`, `jwt`, `oauth` or `certificate`) | `none` or `kerberos` | -| user | Username for authentication | Optional (required if `method` is `none`, `ldap` or `kerberos`) | `commander` | -| password | Password for authentication | Optional (required if `method` is `ldap`) | `none` or `abc123` | -| keytab | Path to keytab for kerberos authentication | Optional (may be required if `method` is `kerberos`) | `/tmp/trino.keytab` | -| krb5_config | Path to config for kerberos authentication | Optional (may be required if `method` is `kerberos`) | `/tmp/krb5.conf` | -| principal | Principal for kerberos authentication | Optional (may be required if `method` is `kerberos`) | `trino@EXAMPLE.COM` | -| service_name | Service name for kerberos authentication | Optional (default is `trino`) | `abc123` | -| jwt_token | JWT token for authentication | Optional (required if `method` is `jwt`) | `none` or `abc123` | -| client_certificate | Path to client certificate to be used for certificate based authentication | Optional (required if `method` is `certificate`) | `/tmp/tls.crt` | -| client_private_key | Path to client private key to be used for certificate based authentication | Optional (required if `method` is `certificate`) | `/tmp/tls.key` | -| http_headers | HTTP Headers to send alongside requests to Trino, specified as a yaml dictionary of (header, value) pairs. 
| Optional | `X-Trino-Client-Info: dbt-trino` | -| http_scheme | The HTTP scheme to use for requests to Trino | Optional (default is `http`, or `https` for `method: kerberos`, `ldap` or `jwt`) | `https` or `http` | -| cert | The full path to a certificate file for authentication with trino | Optional | | -| session_properties | Sets Trino session properties used in the connection | Optional | `query_max_run_time: 5d` | -| database | Specify the database to build models into | Required | `analytics` | -| schema | Specify the schema to build models into. Note: it is not recommended to use upper or mixed case schema names | Required | `public` | -| host | The hostname to connect to | Required | `127.0.0.1` | -| port | The port to connect to the host on | Required | `8080` | -| threads | How many threads dbt should use | Optional (default is `1`) | `8` | -| prepared_statements_enabled | Enable usage of Trino prepared statements (used in `dbt seed` commands) | Optional (default is `true`) | `true` or `false` | -| retries | Configure how many times a database operation is retried when connection issues arise | Optional (default is `3`) diff --git a/website/docs/sql-reference/aggregate-functions/sql-array-agg.md b/website/docs/sql-reference/aggregate-functions/sql-array-agg.md new file mode 100644 index 00000000000..430be4b4316 --- /dev/null +++ b/website/docs/sql-reference/aggregate-functions/sql-array-agg.md @@ -0,0 +1,62 @@ +--- +id: array-agg +title: SQL ARRAY_AGG +description: The ARRAY_AGG function allows you to create an array of multiple data values in SQL. +slug: /sql-reference/array-agg +--- + + + Working with the SQL ARRAY_AGG function + + +In any typical programming language such as Python or Javascript, arrays are typically innate and bountiful; when you’re processing data in SQL, arrays are a little less common but are a handy way to provide more structure to your data. 
+ +To create an array of multiple data values in SQL, you’ll likely leverage the ARRAY_AGG function (short for *array aggregation*), which puts your input column values into an array. + +## How to use SQL ARRAY_AGG + +The ARRAY_AGG function has the following syntax: + +`array_agg( [distinct] <field_name> ) [within group (<order_by_field>)] [over ([partition by <field_name>])]` + +A few notes on the functionality of this function: +- Most of the example syntax from above is optional, meaning the ARRAY_AGG function can be as simple as `array_agg(<field_name>)` or used in a more complex form as a window function +- [DISTINCT](/sql-reference/distinct) is an optional argument that can be passed in, so only distinct values are in the return array +- If the input column is empty, the returning array will also be empty +- Since the ARRAY_AGG is an aggregate function (gasp!), you’ll need a GROUP BY statement at the end of your query if you’re grouping by a certain field +- ARRAY_AGG and similar aggregate functions can become inefficient or costly to compute on large datasets, so use ARRAY_AGG wisely and truly understand your use cases for having arrays in your datasets + +Let’s dive into a practical example using the ARRAY_AGG function. + +### SQL ARRAY_AGG example + +```sql +select + date_trunc('month', order_date) as order_month, + array_agg(distinct status) as status_array +from {{ ref('orders') }} +group by 1 +order by 1 +``` + +This simple query using the sample dataset [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table is returning a new column of distinct order statuses by order month: + +| order_month | status_array | +|:---:|:---:| +| 2018-01-01 | [ "returned", "completed", "return_pending" ] | +| 2018-02-01 | [ "completed", "return_pending" ] | +| 2018-03-01 | [ "completed", "shipped", "placed" ] | +| 2018-04-01 | [ "placed" ] | + +Looking at the query results—this makes sense! We’d expect newer orders to likely not have any returns, and older orders to have completed returns. 
+ +## SQL ARRAY_AGG syntax in Snowflake, Databricks, BigQuery, and Redshift + +[Snowflake](https://docs.snowflake.com/en/sql-reference/functions/array_agg.html), [Databricks](https://docs.databricks.com/sql/language-manual/functions/array_agg.html), and [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/aggregate_functions#array_agg) all support the ARRAY_AGG function. Redshift, however, supports an out-of-the-box [LISTAGG function](https://docs.aws.amazon.com/redshift/latest/dg/r_LISTAGG.html) that can perform similar functionality to ARRAY_AGG. The primary difference is that LISTAGG allows you to explicitly choose a delimiter to separate a list whereas arrays are naturally delimited by commas. + +## ARRAY_AGG use cases + +There are definitely too many use cases to list out for using the ARRAY_AGG function in your dbt models, but it’s very likely that ARRAY_AGG is used pretty downstream in your DAG since you likely don’t want your data so bundled up earlier in your DAG to improve modularity and dryness. A few downstream use cases for ARRAY_AGG: + +- In [`export_` models](https://www.getdbt.com/open-source-data-culture/reverse-etl-playbook) that are used to send data to platforms using a tool to pare down multiple rows into a single row. Some downstream platforms, for example, require certain values that we’d usually keep as separate rows to be one singular row per customer or user. ARRAY_AGG is handy to bring multiple column values together by a singular id, such as creating an array of all items a user has ever purchased and sending that array downstream to an email platform to create a custom email campaign. 
+- Similar to export models, you may see ARRAY_AGG used in [mart tables](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) to create final aggregate arrays per a singular dimension; performance concerns of ARRAY_AGG in these likely larger tables can potentially be bypassed with use of [incremental models in dbt](https://docs.getdbt.com/docs/build/incremental-models). diff --git a/website/docs/sql-reference/aggregate-functions/sql-avg.md b/website/docs/sql-reference/aggregate-functions/sql-avg.md new file mode 100644 index 00000000000..d7d2fccc3c4 --- /dev/null +++ b/website/docs/sql-reference/aggregate-functions/sql-avg.md @@ -0,0 +1,55 @@ +--- +id: avg +title: SQL AVG +description: The AVG function is used to calculate the simple average of a numeric column, but you may also see it used in a window function to calculate rolling averages. +slug: /sql-reference/avg +--- + + + Working with the SQL AVG function + + +You’re a data person, so we assume you’re going to be calculating averages of some metrics \**waves hands airily*\* at some point in your career. And the way to calculate averages of a numeric column in SQL is by using the AVG function. + +## How to use the AVG function + +The AVG function is a part of the group of mathematical or aggregate functions (ex. MIN, MAX, SUM) that are often used in SQL to summarize datasets. You’ll most likely see the AVG function used to straightforwardly calculate the average of a numeric column, but you may also see it used in a window function to calculate rolling averages. + +### AVG function example + +```sql +select + date_trunc('month', order_date) as order_month, + round(avg(amount)) as avg_order_amount +from {{ ref('orders') }} +where status not in ('returned', 'return_pending') +group by 1 +``` + +:::note What dataset is this? +This example is querying from a sample dataset created by dbt Labs called [jaffle_shop](https://github.com/dbt-labs/jaffle_shop). 
+::: + +This query using the Jaffle Shop’s `orders` table will return the rounded order amount per each order month: + +| order_month | avg_order_amount | +|:---:|:---:| +| 2018-01-01 | 18 | +| 2018-02-01 | 15 | +| 2018-03-01 | 18 | +| 2018-04-01 | 17 | + +The AVG function, like many other mathematical functions, is an aggregate function. Aggregate functions operate across all rows, or a group of rows, to return a singular value. When calculating the average of a column across a dimension (or group of dimensions)—in our example above, `order_month`—you need a GROUP BY statement; the query above would not successfully run without it. + +## SQL AVG function syntax in Snowflake, Databricks, BigQuery, and Redshift + +Snowflake, Databricks, Google BigQuery, and Amazon Redshift all support the ability to take the average of a column value and the syntax for the AVG functions is the same across all of those data platforms. + +## AVG function use cases + +We most commonly see the AVG function used in data work to calculate: +- The average of key metrics (ex. Average CSAT, average lead time, average order amount) in downstream [fact or dim models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) +- Rolling or moving averages (ex. 7-day, 30-day averages for key metrics) using window functions +- Averages in [dbt metrics](https://docs.getdbt.com/docs/build/metrics) + +This isn’t an extensive list of where your team may be using the AVG function throughout your dbt models and BI tool logic, but contains some common scenarios analytics engineers face in their day-to-day. 
diff --git a/website/docs/sql-reference/aggregate-functions/sql-count.md b/website/docs/sql-reference/aggregate-functions/sql-count.md new file mode 100644 index 00000000000..42ece4b124f --- /dev/null +++ b/website/docs/sql-reference/aggregate-functions/sql-count.md @@ -0,0 +1,65 @@ +--- +id: count +title: SQL COUNT +description: COUNT is an aggregate function that is used to return the count of rows of a specified field or all rows in a dataset. It is commonly used to get baseline statistical information of a dataset, help ensure primary keys are unique, and calculate business metrics. +slug: /sql-reference/count +--- + + + Working with the SQL COUNT function + + +COUNT is a SQL function you need to know how to use. Whether it’s in an ad hoc query, a data model, or in a BI tool calculation, you’ll be using the SQL COUNT function countless times (pun intended) in your data work. + +To formalize it, COUNT is an aggregate function that is used to return the count of rows of a specified field (`count()`) or all rows in a dataset (`count(*)`). It is commonly used to get baseline statistical information of a dataset, help ensure primary keys are unique, and calculate business metrics. + +## How to use SQL COUNT in a query + +Use the following syntax to generate the aggregate count of a field: + +`count()` + +Since COUNT is an aggregate function, you’ll need a GROUP BY statement in your query if you’re looking at counts broken out by dimension(s). If you’re calculating the standalone counts of fields without the need to break them down by another field, you don’t need a GROUP BY statement. + +Let’s take a look at a practical example using COUNT, DISTINCT, and GROUP BY below. + +### COUNT example + +```sql +select + date_part('month', order_date) as order_month, + count(order_id) as count_all_orders, + count(distinct(customer_id)) as count_distinct_customers +from {{ ref('orders') }} +group by 1 +``` + +:::note What dataset is this? 
+This example is querying from a sample dataset created by dbt Labs called [jaffle_shop](https://github.com/dbt-labs/jaffle_shop). +::: + +This simple query is something you may do while doing initial exploration of your data; it will return the count of `order_ids` and count of distinct `customer_ids` per order month that appear in the Jaffle Shop’s `orders` table: + +| order_month | count_all_orders | count_distinct_customers | +|:---:|:---:|:---:| +| 1 | 29 | 24 | +| 2 | 27 | 25 | +| 3 | 35 | 31 | +| 4 | 8 | 8 | + +An analyst or analytics engineer may want to perform a query like this to understand the ratio of orders to customers and see how it changes seasonally. + +## SQL COUNT syntax in Snowflake, Databricks, BigQuery, and Redshift + +All modern data warehouses support the ability to use the COUNT function (and follow the same syntax!). + +Some data warehouses, such as Snowflake and Google BigQuery, additionally support a COUNT_IF/COUNTIF function that allows you to pass in a boolean expression to determine whether to count a row or not. + +## COUNT use cases + +We most commonly see queries using COUNT to: +- Perform initial data exploration on a dataset to understand dataset volume, primary key uniqueness, distribution of column values, and more. +- Calculate the counts of key business metrics (daily orders, customers created, etc.) in your data models or BI tool. +- Define [dbt metrics](/docs/build/metrics) to aggregate key metrics. + +This isn’t an extensive list of where your team may be using COUNT throughout your development work, dbt models, and BI tool logic, but it contains some common scenarios analytics engineers face day-to-day. 
\ No newline at end of file diff --git a/website/docs/sql-reference/aggregate-functions/sql-max.md b/website/docs/sql-reference/aggregate-functions/sql-max.md new file mode 100644 index 00000000000..0b5dc5521ea --- /dev/null +++ b/website/docs/sql-reference/aggregate-functions/sql-max.md @@ -0,0 +1,62 @@ +--- +id: max +title: SQL MAX +description: The SQL MAX aggregate function allows you to compute the maximum value from a column. +slug: /sql-reference/max +--- + + + Working with the SQL MAX function + + +The SQL MAX aggregate function allows you to compute the maximum value from a column. This kind of measure is useful for understanding the distribution of column values, determining the most recent timestamps of key events, and creating booleans from CASE WHEN statements to flatten semi-structured data. + +## How to use the SQL MAX function in a query + +Use the following syntax to find the maximum value of a field: + +`max(<field_name>)` + +Since MAX is an aggregate function, you’ll need a GROUP BY statement in your query if you’re looking at maximums broken out by dimension(s). If you’re calculating the standalone maximum of fields without the need to break them down by another field, you don’t need a GROUP BY statement. + +MAX can also be used as a window function to operate across specified or partitioned rows. + +Let’s take a look at a practical example using MAX and GROUP BY below. + +### MAX example + +```sql +select + date_part('month', order_date) as order_month, + max(amount) as max_amount +from {{ ref('orders') }} +group by 1 +``` + +:::note What dataset is this? +This example is querying from a sample dataset created by dbt Labs called [jaffle_shop](https://github.com/dbt-labs/jaffle_shop). 
+::: + +This simple query is something you may do while doing initial exploration of your data; it will return the maximum order `amount` per order month that appears in the Jaffle Shop’s `orders` table: + +| order_month | max_amount | +|:---:|:---:| +| 1 | 58 | +| 2 | 30 | +| 3 | 56 | +| 4 | 26 | + +## SQL MAX function syntax in Snowflake, Databricks, BigQuery, and Redshift + +All modern data warehouses support the ability to use the MAX function (and follow the same syntax!). + +## MAX function use cases + +We most commonly see queries using MAX to: + +- Perform initial data exploration on a dataset to understand the distribution of column values. +- Identify the most recent timestamp for key events (ex. `max(login_timestamp_utc) as last_login`). +- Create descriptive boolean values from case when statements (ex. `max(case when status = 'complete' then 1 else 0 end) as has_complete_order`). +- Establish the most recent timestamp from a table to filter on rows appropriately for [incremental model builds](https://docs.getdbt.com/docs/build/incremental-models). + +This isn’t an extensive list of where your team may be using MAX throughout your development work, dbt models, and BI tool logic, but it contains some common scenarios analytics engineers face day-to-day. \ No newline at end of file diff --git a/website/docs/sql-reference/aggregate-functions/sql-min.md b/website/docs/sql-reference/aggregate-functions/sql-min.md new file mode 100644 index 00000000000..6080bb20c0b --- /dev/null +++ b/website/docs/sql-reference/aggregate-functions/sql-min.md @@ -0,0 +1,62 @@ +--- +id: min +title: SQL MIN +description: The MIN aggregate function allows you to compute the minimum value from a column or across a set of rows for a column. +slug: /sql-reference/min +--- + + + Working with SQL MIN + + +SQL MIN, MAX, SUM…the aggregate functions that you’ll live and die by as an analytics practitioner. 
In this post, we’re going to unpack the SQL MIN function, how to use it, and why it's valuable in data work. + +The MIN aggregate function allows you to compute the minimum value from a column or across a set of rows for a column. The results from the MIN function are useful for understanding the distribution of column values and determining the first timestamps of key events. + +## How to use the MIN function in a query + +Use the following syntax in a query to find the minimum value of a field: + +`min()` + +Since MIN is an aggregate function, you’ll need a GROUP BY statement in your query if you’re looking at counts broken out by dimension(s). If you’re calculating the standalone minimum of fields without the need to break them down by another field, you don’t need a GROUP BY statement. + +MIN can also be used as a window function to operate across specified or partitioned rows. + +Let’s take a look at a practical example using MIN below. + +### MIN example + +```sql +select + customer_id, + min(order_date) as first_order_date, + max(order_date) as last_order_date +from {{ ref('orders') }} +group by 1 +limit 3 +``` + +:::note What dataset is this? +This example is querying from a sample dataset created by dbt Labs called [jaffle_shop](https://github.com/dbt-labs/jaffle_shop). +::: + +This simple query is returning the first and last order date for a customer in the Jaffle Shop’s `orders` table: + +| customer_id | first_order_date | last_order_date | +|:---:|:---:|:---:| +| 1 | 2018-01-01 | 2018-02-10 | +| 3 | 2018-01-02 | 2018-03-11 | +| 94 | 2018-01-04 | 2018-01-29 | + +## SQL MIN function syntax in Snowflake, Databricks, BigQuery, and Redshift + +All modern data warehouses support the ability to use the MIN function (and follow the same syntax). + +## MIN function use cases + +We most commonly see queries using MIN to: +- Perform initial data exploration on a dataset to understand the distribution of column values. 
+- Identify the first timestamp for key events (ex. `min(login_timestamp_utc) as first_login`). + +This isn’t an extensive list of where your team may be using MIN throughout your development work, dbt models, and BI tool logic, but it contains some common scenarios analytics engineers face day-to-day. \ No newline at end of file diff --git a/website/docs/sql-reference/aggregate-functions/sql-round.md b/website/docs/sql-reference/aggregate-functions/sql-round.md new file mode 100644 index 00000000000..053a2ebdd8e --- /dev/null +++ b/website/docs/sql-reference/aggregate-functions/sql-round.md @@ -0,0 +1,65 @@ +--- +id: round +title: SQL ROUND +description: The ROUND aggregate function allows you to round numeric fields or values in SQL to the number of decimal places of your choosing. +slug: /sql-reference/round +--- + + + Working with SQL ROUND + + +If you’re reading this, that probably means you’re a data person. And as a data person who’s likely modeling data for analytics use cases, you’re going to need to round data from time to time. For the unacquainted, "rounding" is making a number simpler so that it's easier to understand while keeping it close to its original value. In data, a common use case for rounding is to decrease the number of decimal places a numeric record has. + +To round numeric fields or values in SQL, you’re going to use the handy ROUND function. + +## How to use the SQL ROUND function + +The syntax for using ROUND function looks like the following: + +```sql +round(, [optional] ) +``` +In this function, you’ll need to input the *numeric* field or data you want rounded and pass in an optional number to round your field by. For most data warehouses, the number of decimal places is defaulted to 0 or 1, meaning if you rounded 20.00 using `round(20.00)`, it would return 20 or 20.0 (depending on your data warehouse). + +### SQL ROUND function example + +:::note What dataset is this? 
+This example is querying from a sample dataset created by dbt Labs called [jaffle_shop](https://github.com/dbt-labs/jaffle_shop). +::: + +You can round some of the numeric fields of the Jaffle Shop’s `orders` model using the following code: + +```sql +select + cast(order_id as string) as order_id, + order_date, + amount, + round(amount, 1) as rounded_amount +from {{ ref('orders') }} +``` + +After running this query, the resulting `orders` table will look a little something like this: + +| order_id | order_date | amount | rounded_amount | +|---|---|---|---| +| 1 | 2018-01-01 | 10.000000 | 10.0 | +| 2 | 2018-01-02 | 20.000000 | 20.0 | +| 3 | 2018-01-04 | 1.000000 | 1.0 | + +The new `rounded_amount` column is the `amount` field rounded to 1 decimal place. + +For most data warehouses, the data type returned by the ROUND function should be the same as the input data type. If you input a float type into the ROUND function, the returned rounded number should also be a float. + +## SQL ROUND function syntax in Snowflake, Databricks, BigQuery, and Redshift + +Google BigQuery, Amazon Redshift, Snowflake, and Databricks all support the ability to round numeric columns and data. In addition, the syntax to round is the same across all of them using the ROUND function. + +## ROUND function use cases + +If you find yourself rounding numeric data, either in data models or ad-hoc analyses, you’re probably rounding to improve the readability and usability of your data using downstream [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) or [mart models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts). 
Specifically, you’ll likely use the ROUND function to: + +- Make numeric calculations using division or averages a little cleaner and easier to understand +- Create concrete buckets of data for a cleaner distribution of values during ad-hoc analysis + +You’ll additionally likely see the ROUND function used in your BI tool as it generates rounded clean numbers for business users to interact with. diff --git a/website/docs/sql-reference/aggregate-functions/sql-sum.md b/website/docs/sql-reference/aggregate-functions/sql-sum.md new file mode 100644 index 00000000000..cb9235798d2 --- /dev/null +++ b/website/docs/sql-reference/aggregate-functions/sql-sum.md @@ -0,0 +1,64 @@ +--- +id: sum +title: SQL SUM +description: The SUM aggregate function allows you to calculate the sum of a numeric column or across a set of rows for a column. +slug: /sql-reference/sum +--- + + + Working with SQL SUM + + +The SQL SUM function is handy and ever-present in data work. Let’s unpack what it is, how to use it, and why it's valuable. + +Jumping into it, the SUM aggregate function allows you to calculate the sum of a numeric column or across a set of rows for a column. Ultimately, the SUM function is incredibly useful for calculating meaningful business metrics, such as Lifetime Value (LTV), and creating key numeric fields in [`fct_` and `dim_` models](/terms/dimensional-modeling). + +## How to use the SUM function in a query + +Use the following syntax in a query to find the sum of a numeric field: + +`sum()` + +Since SUM is an aggregate function, you’ll need a GROUP BY statement in your query if you’re looking at counts broken out by dimension(s). If you’re calculating the standalone sum of a numeric field without the need to break them down by another field, you don’t need a GROUP BY statement. + +SUM can also be used as a window function to operate across specified or partitioned rows. 
You can additionally pass a DISTINCT statement into a SUM function to only sum distinct values in a column. + +Let’s take a look at a practical example using the SUM function below. + +### SUM example + +```sql +select + customer_id, + sum(order_amount) as all_orders_amount +from {{ ref('orders') }} +group by 1 +limit 3 +``` + +:::note What dataset is this? +This example is querying from a sample dataset created by dbt Labs called [jaffle_shop](https://github.com/dbt-labs/jaffle_shop). +::: + +This simple query is returning the summed amount of all orders for a customer in the Jaffle Shop’s `orders` table: + +| customer_id | all_orders_amount | +|:---:|:---:| +| 1 | 33 | +| 3 | 65 | +| 94 | 24 | + +## SQL SUM function syntax in Snowflake, Databricks, BigQuery, and Redshift + +All modern data warehouses support the ability to use the SUM function (and follow the same syntax). + +## SUM function use cases + +We most commonly see queries using SUM to: + +- Calculate the cumulative sum of a metric across a customer/user id using a CASE WHEN statement (ex. `sum(case when order_array is not null then 1 else 0 end) as count_orders`) +- Create [dbt metrics](/docs/build/metrics) for key business values, such as LTV +- Calculate the total of a field across a dimension (ex. total session time, total time spent per ticket) that you typically use in `fct_` or `dim_` models +- Summing clicks, spend, impressions, and other key ad reporting metrics in tables from ad platforms + +This isn’t an extensive list of where your team may be using SUM throughout your development work, dbt models, and BI tool logic, but it contains some common scenarios analytics engineers face day-to-day. 
\ No newline at end of file diff --git a/website/docs/sql-reference/clauses/sql-having.md b/website/docs/sql-reference/clauses/sql-having.md new file mode 100644 index 00000000000..3d7813add5b --- /dev/null +++ b/website/docs/sql-reference/clauses/sql-having.md @@ -0,0 +1,90 @@ +--- +id: having +title: SQL HAVING +description: Read this guide to learn about the SQL HAVING clause in dbt. +slug: /sql-reference/having +--- + + + Working with the HAVING clause in SQL + + +SQL HAVING is just one of those little things that are going to make your ad hoc data work a little easier. + +A not-so-fun fact about the [WHERE clause](/sql-reference/where) is that you can’t filter on aggregates with it…that’s where HAVING comes in. With HAVING, you can not only define an aggregate in a [select](/sql-reference/select) statement, but also filter on that newly created aggregate within the HAVING clause. + +This page will walk through how to use HAVING, when you should use it, and discuss data warehouse support for it. + + +## How to use the HAVING clause in SQL + +The HAVING clause essentially requires one thing: an aggregate field to evaluate. Since HAVING is technically a boolean, it will return rows that execute to true, similar to the WHERE clause. + +The HAVING condition is followed after a [GROUP BY statement](/sql-reference/group-by) and optionally enclosed with an ORDER BY statement: + +```sql +select + -- query +from +group by +having condition +[optional order by] +``` + +That example syntax looks a little gibberish without some real fields, so let’s dive into a practical example using HAVING. 
+ +### SQL HAVING example + + + + +```sql +select + customer_id, + count(order_id) as num_orders +from {{ ref('orders') }} +group by 1 +having num_orders > 1 --if you replace this with `where`, this query would not successfully run +``` + + + +```sql +with counts as ( + select + customer_id, + count(order_id) as num_orders + from {{ ref('orders') }} + group by 1 +) +select + customer_id, + num_orders +from counts +where num_orders > 1 +``` + + + + +This simple query using the sample dataset [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return customers who have had more than one order: + +| customer_id | num_orders | +|:---:|:---:| +| 1 | 2 | +| 3 | 3 | +| 94 | 2 | +| 64 | 2 | +| 54 | 4 | + +The query above using the CTE utilizes more lines compared to the simpler query using HAVING, but will produce the same result. + +## SQL HAVING clause syntax in Snowflake, Databricks, BigQuery, and Redshift + +[Snowflake](https://docs.snowflake.com/en/sql-reference/constructs/having.html), [Databricks](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-qry-select-having.html), [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#having_clause), and [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_HAVING_clause.html) all support the HAVING clause and the syntax for using HAVING is the same across each of those data warehouses. diff --git a/website/docs/sql-reference/clauses/sql-limit.md b/website/docs/sql-reference/clauses/sql-limit.md new file mode 100644 index 00000000000..74cc2e12123 --- /dev/null +++ b/website/docs/sql-reference/clauses/sql-limit.md @@ -0,0 +1,74 @@ +--- +id: limit +title: SQL LIMIT +description: Read this guide to learn about the SQL LIMIT clause in dbt. +slug: /sql-reference/limit +--- + + + Working with the SQL LIMIT clause + + +When you’re developing data models or drafting up a query, do you usually need to see all results from it? Not normally. Hence, we LIMIT. 
+ +Adding the LIMIT clause to a query will limit the number of rows returned. It’s useful for when you’re developing data models, ensuring SQL in a query is functioning as expected, and wanting to save some money during development periods. + +## How to use the LIMIT clause in a query + +To limit the number of rows returned from a query, you would pass the LIMIT in the last line of the query with the number of rows you want returned: + +```sql +select + some_rows +from my_data_source +limit +``` + +Let’s take a look at a practical example using LIMIT below. + +### LIMIT example + +```sql +select + order_id, + order_date, + rank () over (order by order_date) as order_rnk +from {{ ref('orders') }} +order by 2 +limit 5 +``` + +This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return these exact 5 rows: + +| order_id | order_date | order_rnk | +|:---:|:---:|:---:| +| 1 | 2018-01-01 | 1 | +| 2 | 2018-01-02 | 2 | +| 3 | 2018-01-04 | 3 | +| 4 | 2018-01-05 | 4 | +| 5 | 2018-01-05 | 4 | + +After ensuring that this is the result you want from this query, you can omit the LIMIT in your final data model. + +:::tip Save money and time by limiting data in development +You could limit your data used for development by manually adding a LIMIT statement, a WHERE clause to your query, or by using a [dbt macro to automatically limit data based](https://docs.getdbt.com/guides/legacy/best-practices#limit-the-data-processed-when-in-development) on your development environment to help reduce your warehouse usage during dev periods. +::: + +## LIMIT syntax in Snowflake, Databricks, BigQuery, and Redshift + +All modern data warehouses support the ability to LIMIT a query and the syntax is also the same across them. Use the table below to read more on the documentation for limiting query results in your data warehouse. + +| Data warehouse | LIMIT support? 
| +|:---:|:---:| +| Snowflake | ✅ | +| Databricks | ✅ | +| Amazon Redshift | ✅ | +| Google BigQuery | ✅ | + +## LIMIT use cases + +We most commonly see queries limited in data work to: +- Save some money in development work, especially for large datasets; just make sure the model works across a subset of the data instead of all of the data 💸 +- Paired with an ORDER BY statement, grab the top 5, 10, 50, 100, etc. entries from a dataset + +This isn’t an extensive list of where your team may be using LIMIT throughout your development work, but it contains some common scenarios analytics engineers face day-to-day. diff --git a/website/docs/sql-reference/clauses/sql-order-by.md b/website/docs/sql-reference/clauses/sql-order-by.md new file mode 100644 index 00000000000..660794adc14 --- /dev/null +++ b/website/docs/sql-reference/clauses/sql-order-by.md @@ -0,0 +1,65 @@ +--- +id: order-by +title: SQL ORDER BY +description: Read this guide to learn about the SQL ORDER BY clause in dbt. +slug: /sql-reference/order-by +--- + + + Working with the SQL ORDER BY clause + + +The ORDER BY clause allows you to specify the resulting row order for a query. In practice, you use the ORDER BY clause to indicate which field(s) you want to order by and in what type of order you want (ascending or descending). It’s useful to leverage during ad hoc analyses and for creating appropriate column values for partitioned rows in window functions. 
+ +## How to use the SQL ORDER BY clause + +ORDER BY clauses have multiple use cases in analytics work, but we see it most commonly utilized to: +- Order a query or subquery result by a column or group of columns +- Appropriately order a subset of rows in a window function + +To add the ORDER BY clause to a query or model, use the following syntax: + +```sql +select + column_1, + column_2 +from source_table +order by <field(s)> <asc/desc> --comes after FROM, WHERE, and GROUP BY statements +``` +You can order a query result by multiple columns, represented by their column name or by their column number in the select statement (ex. `order by column_2 == order by 2`). You can additionally specify the ordering type you want (ascending or descending) to return the desired row order. + +Let’s take a look at a practical example using ORDER BY. + +### ORDER BY example + +```sql +select + date_trunc('month', order_date) as order_month, + round(avg(amount)) as avg_order_amount +from {{ ref('orders') }} +group by 1 +order by 1 desc +``` + +This query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return the rounded average order amount for each order month in descending order: + +| order_month | avg_order_amount | +|:---:|:---:| +| 2018-04-01 | 17 | +| 2018-03-01 | 18 | +| 2018-02-01 | 15 | +| 2018-01-01 | 17 | + +## SQL ORDER BY syntax in Snowflake, Databricks, BigQuery, and Redshift + +Since the ORDER BY clause is a SQL fundamental, data warehouses, including Snowflake, Databricks, Google BigQuery, and Amazon Redshift, all support the ability to add ORDER BY clauses in queries and window functions. 
+ +## ORDER BY use cases + +We most commonly see the ORDER BY clause used in data work to: +- Analyze data for both initial exploration of raw data sources and ad hoc querying of [mart datasets](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) +- Identify the top 5/10/50/100 of a dataset when used in pair with a [LIMIT](/sql-reference/limit) +- (For Snowflake) Optimize the performance of large incremental models that use both a `cluster_by` [configuration](https://docs.getdbt.com/reference/resource-configs/snowflake-configs#using-cluster_by) and ORDER BY statement +- Control the ordering of window function partitions (ex. `row_number() over (partition by user_id order by updated_at)`) + +This isn’t an extensive list of where your team may be using the ORDER BY clause throughout your dbt models, ad hoc queries, and BI tool logic, but it contains some common scenarios analytics engineers face day-to-day. diff --git a/website/docs/sql-reference/clauses/sql-where.md b/website/docs/sql-reference/clauses/sql-where.md new file mode 100644 index 00000000000..cd10a88efb7 --- /dev/null +++ b/website/docs/sql-reference/clauses/sql-where.md @@ -0,0 +1,43 @@ +--- +id: where +title: SQL WHERE +description: Read this guide to learn about the SQL WHERE clause in dbt. +slug: /sql-reference/where +--- + + + Working with the SQL WHERE clause + + +If the humble [SELECT statement](/sql-reference/select) is an analytics engineer kitchen knife, the WHERE clause is the corresponding knife sharpener: no (good) cooking (or data modeling) is happening without it. + +The WHERE clause is a fundamental SQL statement—it allows you to appropriately filter your data models and queries, so you can look at specific subsets of data based on your requirements. 
+ +## How to use the SQL WHERE clause + +The syntax for using the WHERE clause in a SELECT statement looks like the following: + +```sql +select + order_id, + customer_id, + amount +from {{ ref('orders') }} +where status != 'returned' +``` + +In this query, you’re filtering for any order from the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` model whose status is not `returned` by adding a WHERE clause after the FROM statement. You could additionally filter on string, numeric, date, or other data types to meet your query conditions. + +You will likely see WHERE clauses show up 99.99% of the time in a typical query or dbt model. The other .01% is probably in a statement, such as DELETE or ALTER, to modify specific rows in tables. + +## SQL WHERE clause syntax in Snowflake, Databricks, BigQuery, and Redshift + +Since the WHERE clause is a SQL fundamental, Google BigQuery, Amazon Redshift, Snowflake, and Databricks all support the ability to filter queries and data models using it. In addition, the syntax for filtering with the WHERE clause is the same across all of them. + +## SQL WHERE clause use cases + +WHERE clauses are probably some of the most widely used SQL capabilities, right after SELECT and FROM statements. 
Below is a non-exhaustive list of where you’ll commonly see WHERE clauses throughout dbt projects and data work: +- Removing source-deleted rows from staging models to increase accuracy and improve downstream model performance +- Filtering out employee records from models +- Performing ad-hoc analysis on specific rows or users, either in a dbt model, BI tool, or ad-hoc query +- Paired with IN, LIKE, NOT IN clauses to create more generalized or a group of specific requirements to filter on diff --git a/website/docs/sql-reference/data-type/sql-data-types.md b/website/docs/sql-reference/data-type/sql-data-types.md new file mode 100644 index 00000000000..844619032d9 --- /dev/null +++ b/website/docs/sql-reference/data-type/sql-data-types.md @@ -0,0 +1,67 @@ +--- +id: data-types +title: SQL Data Types +description: The different data types in SQL are numeric, string, date, booleans, and semi-structured. This content covers the differences between them and their subcategories. +slug: /sql-reference/data-types +--- + + + What are the SQL data types? + + +Below, we’ll unpack the different umbrellas of data types and the unique data types that fall under each category. + +## Numeric data types + +There are many different numeric types in SQL and that makes sense because…we’re data people and numbers are important, bit length is important, decimal places are even more important, and numbers are ultimately what allow stakeholders to make certain decisions. + +There’s slight differentiation in which numeric data types are supported across each data warehouse, but fundamentally, it’s most important to understand the differences between integers, decimals, and floats. + +| **Type** | **Definition** | **Use cases** | +|:---:|:---:|:---:| +| Integer | Integers are numbers without fractions. Think 1, 2, 72384191203—nice, clean numbers. 
| Though many column values may look like integers (and in theory, they are), they’re often reflected or cast as decimal/numeric types to offer future precision and scale if required. | +| Decimal | Decimal, also known as the NUMERIC type, is a numeric data type that has a default precision of 38 and a scale of 0. | Typical numeric columns in datasets, such as lifetime value or user ids. Most likely the most common form of numeric data in your tables. | +| Float | Floats are used to provide approximate numeric values of fractions, with a precision of up to 64 bits. Floats offer a larger range of values compared to decimals. | Columns that are percentages; longitude/latitude. | + +## String data types + +Strings are everywhere in data—they allow folks to have descriptive text field columns, use regex in their data work, and honestly, they just make the data world go ‘round. To formalize it, a string type is a word, or the combination of characters that you’ll typically see encased in single quotes (ex. 'Jaffle Shop', '1234 Shire Lane', 'Plan A'). + +Snowflake, Databricks, Google BigQuery, and Amazon Redshift all support the string data type. They may have slightly varying sub-types for strings; some data warehouses such as Snowflake and Redshift support `text`, `char`, and `character` string types which typically differ in byte length in comparison to the generic string type. + +Again, since most string type columns are inherent in your data, you’ll likely be ok using generic varchar or strings for casting, but it never hurts to read up on the docs specific to your data warehouse string support! + +## Date data types + +Dates, timestamps, timezones—all the fun (slightly painful) data things that make analytics engineers real data practitioners (people who occasionally want to yank their hair out). + +Below, we’ll unpack dates, datetimes, times, and timestamps, to help you better understand the core date data types. 
+ +Working our way from simplest to most complex, dates, typically represented with the DATE type are what you typically associate with a calendar date (ex. 2022-12-16), and are limited to the range of 0001-01-01 to 9999-12-31. + +DATETIME values contain both calendar date and time (ex. 2022-12-16 02:33:24) and may additionally include the sub-seconds. TIME types are typically represented as the HH:MM:SS of a time and don’t contain a specified timezone. + +TIMESTAMP data types allow for the greatest specification and precision of a point in time and can be specified with or without a timezone. Most event-driven data fields (ex. Order completed time, account created time, user churned time) will be represented as timestamps in your data sources. Some data warehouses, such as [Amazon Redshift](https://docs.amazonaws.cn/en_us/redshift/latest/dg/r_Datetime_types.html) and [Snowflake](https://docs.snowflake.com/en/sql-reference/data-types-datetime.html#date-time-data-types), support different timestamp options that allow for explicit specification of a timezone (or lack thereof). + +In general, the two best practices when it comes to dates and times are: +1. Keep (or convert) timestamps to the same timezone. +2. Keep date types in the most specific date-type as possible: you can always zoom out of a timestamp to get a date, but can’t get a timestamp from a date. + +You’ll ultimately leverage handy date functions to zoom in and out of dates, convert dates, or add times to dates. + +## Booleans + +A boolean is a column value that is either true, false, or null. In your datasets, you’ll use booleans to create `is_` or `has_` fields to create clear segments in your data; for example, you may use booleans to indicate whether a customer has churned (`has_churned`) or denote employee records (`is_employee`), or filter out records that have been removed from your source data (`is_deleted`). 
+ +Typically, you’ll see `True` or `False` as the actual boolean values in a column, but may also choose to use numeric values, such as 1 and 0, to represent true and false values. The strings of `True` and `False`, however, tend to be a bit easier to read and interpret for end business users. + +## Semi-structured data types + +Semi-structured data types are a great way to combine or aggregate data across multiple fields; you may also find yourself in the inverse situation where you need to unpack semi-structured data, such as a JSON object, and unnest it into its individual key-value pair. The two primary semi-structured data types you’ll see across data warehouses are JSON and arrays. + +Below, we’ll unpack what the difference is between the two and provide an example of each one. + +| **Type** | **Definition** | **Example** | **Use case** | +|:---:|:---:|:---:|:---:| +| JSON | When looking at data formatted in JSON, we say that the data is stored in JSON objects. These are composed of key-value pairs. JSON objects are enclosed in curly brackets ({ }) and each key-value pair is separated by a comma. Read more about using JSON here. | {"customer_id":2947, "order_id":4923, "order_items":"cheesecake"} | One of the great things about JSON data is that it doesn't require schema definition—until you unnest it. Extract exactly what you need from your JSON object, and you can forget about the rest! JSON values will often come inherent in your data sources, so learn how to unnest them and your life will become easier. | +| Array | Similar to arrays in other programming languages, an array contains multiple elements that are accessible via its position in that array. | ["cheesecake", "cupcake", "brownie"] | Arrays are a clear way to aggregate multiple values together to create a singular value. Many use cases here, but be cautious: using aggregate functions, such as `array_agg` , can become inefficient on large datasets. 
| \ No newline at end of file diff --git a/website/docs/sql-reference/data-type/sql-strings.md b/website/docs/sql-reference/data-type/sql-strings.md new file mode 100644 index 00000000000..fe5f0cab0aa --- /dev/null +++ b/website/docs/sql-reference/data-type/sql-strings.md @@ -0,0 +1,60 @@ +--- +id: strings +title: SQL Strings +description: Strings in SQL are words or combination of characters that you’ll typically see encased in single quotes (ex. 'Jaffle Shop', '1234 Shire Lane', 'Plan A'). +slug: /sql-reference/strings +--- + + + Working with the SQL Strings + + +We can almost guarantee that there is not a single dbt model or table in your database that doesn’t have at least one column of a string type. + +Strings are everywhere in data—they allow folks to have descriptive text field columns, use regex in their data work, and honestly, they just make the data world go ‘round. + +Below, we’ll unpack the different string formats you might see in a modern cloud data warehouse and common use cases for strings. + +## Using SQL strings + +Strings are inherent in your data—they’re the name fields that someone inputs when they sign up for an account, they represent the item someone bought from your ecommerce store, they describe the customer address, and so on. + +To formalize it a bit, a string type is a word, or the combination of characters that you’ll typically see encased in single quotes (ex. 'Jaffle Shop', '1234 Shire Lane', 'Plan A'). 
+ +Most often, when you’re working with strings in a dbt model or query, you’re: + +- Changing the casing (uppering/lowering) to create some standard for your string type columns in your data warehouse +- Concatenating strings together to create more robust, uniform, or descriptive string values +- Unnesting more complex structured data objects and converting those values to explicit strings +- Casting a column of a different type to a string for better compatibility or usability in a BI tool +- Filtering queries on certain string values +- Creating a new string column type based off a CASE WHEN statement to bucket data by +- Splitting a string into a substring + +This is not an exhaustive list of string functionality or use cases, but contains some common scenarios analytics engineers face day-to-day. + +### Strings in an example query + +```sql +select + date_trunc('month', order_date)::string as order_month, + round(avg(amount)) as avg_order_amount +from {{ ref('orders') }} +where status not in ('returned', 'return_pending') +group by 1 +``` + +This query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return the order month as a string and rounded order amount for only orders with statuses not equal to `returned` or `return_pending` string values: + +| order_month | avg_order_amount | +|:---:|:---:| +| 2018-01-01 | 18 | +| 2018-02-01 | 15 | +| 2018-03-01 | 18 | +| 2018-04-01 | 17 | + +## String support in Snowflake, Databricks, BigQuery, and Redshift + +Snowflake, Databricks, Google BigQuery, and Amazon Redshift all support the string [data type](/sql-reference/data-types#string-data-types). They may have slightly varying sub-types for strings; some data warehouses such as Snowflake and Redshift support text, char, and character string types which typically differ in byte length in comparison to the generic string type. 
+ +Again, since most string type columns are inherent in your data, you’ll likely be ok using generic varchar or strings for casting, but it never hurts to read up on the docs specific to your data warehouse string support! diff --git a/website/docs/sql-reference/date-functions/sql-date-trunc.md b/website/docs/sql-reference/date-functions/sql-date-trunc.md new file mode 100644 index 00000000000..3cdcb76ac5f --- /dev/null +++ b/website/docs/sql-reference/date-functions/sql-date-trunc.md @@ -0,0 +1,79 @@ +--- +id: datetrunc +title: SQL DATE_TRUNC +description: The DATE_TRUNC function will truncate a date or time to the first instance of a given date part. You can truncate to the weeks, months, years, or other date parts for a date or time field. +slug: /sql-reference/date-trunc +--- + + + Working with the SQL DATE_TRUNC function + + +In general, data people prefer the more granular over the less granular. [Timestamps > dates](https://docs.getdbt.com/blog/when-backend-devs-spark-joy#signs-the-data-is-sparking-joy), daily data > weekly data, etc.; having data at a more granular level always allows you to zoom in. However, you’re likely looking at your data at a somewhat zoomed-out level—weekly, monthly, or even yearly. To do that, you’re going to need a handy dandy function that helps you round out date or time fields. + +The DATE_TRUNC function will truncate a date or time to the first instance of a given date part. Wordy, wordy, wordy! What does this really mean? If you were to truncate `2021-12-13` out to its month, it would return `2021-12-01` (the first day of the month). + +Using the DATE_TRUNC function, you can truncate to the weeks, months, years, or other date parts for a date or time field. This can make date/time fields easier to read, as well as help perform cleaner time-based analyses. + +Overall, it’s a great function to use to help you aggregate your data into specific date parts while keeping a date format. 
However, the DATE_TRUNC function isn’t your Swiss army knife—it’s not able to do magic or solve all of your problems (we’re looking at you [star](https://getdbt.com/sql-foundations/star-sql-love-letter/)). Instead, DATE_TRUNC is your standard kitchen knife—it’s simple and efficient, and you almost never start cooking (data modeling) without it. + +## How to use the DATE_TRUNC function + +For the DATE_TRUNC function, there are two arguments you must pass in: + +- The date part: This is the days/months/weeks/years (level) you want your field to be truncated out to +- The date/time you want to be truncated + +The DATE_TRUNC function can be used in [SELECT](/sql-reference/select) statements and [WHERE](/sql-reference/where) clauses. + +Most, if not all, modern cloud data warehouses support some type of the DATE_TRUNC function. There may be some minor differences in the argument order for DATE_TRUNC across data warehouses, but the functionality very much remains the same. + +Below, we’ll outline some of the slight differences in the implementation between some of the data warehouses. + +## The DATE_TRUNC function in Snowflake and Databricks + +In [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/date_trunc.html) and [Databricks](https://docs.databricks.com/sql/language-manual/functions/date_trunc.html), you can use the DATE_TRUNC function using the following syntax: + +```sql +date_trunc(<date_part>, <date/time field>) +``` + +In these platforms, the `<date_part>` is passed in as the first argument in the DATE_TRUNC function. 
+ +## The DATE_TRUNC function in Google BigQuery and Amazon Redshift​ + +In [Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions#date_trunc) and [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_DATE_TRUNC.html), the `` is passed in as the first argument and the `` is the second argument: + +```sql +date_trunc(, ) +``` + +A note on BigQuery: BigQuery’s DATE_TRUNC function supports the truncation of date types, whereas Snowflake, Redshift, and Databricks’ `` can be a date or timestamp data type. BigQuery also supports DATETIME_TRUNC and TIMESTAMP_TRUNC functions to support truncation of more granular date/time types. + +## A dbt macro to remember​ + +Why Snowflake, Amazon Redshift, Databricks, and Google BigQuery decided to use different implementations of essentially the same function is beyond us and it’s not worth the headache trying to figure that out. Instead of remembering if the `` or the `` comes first, (which, let’s be honest, we can literally never remember) you can rely on a dbt Core macro to help you get away from finicky syntax. + +With dbt v1.2, [adapters](https://docs.getdbt.com/docs/supported-data-platforms) now support [cross-database macros](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros) to help you write certain functions, like DATE_TRUNC and DATEDIFF, without having to memorize sticky function syntax. 
+ +Using the [Jaffle Shop](https://github.com/dbt-labs/jaffle_shop/blob/main/models/orders.sql), a simple dataset and dbt project, you can truncate the `order_date` from the orders table using the [dbt DATE_TRUNC macro](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros#date_trunc): + +```sql +select + order_id, + order_date, + {{ date_trunc("week", "order_date") }} as order_week, + {{ date_trunc("month", "order_date") }} as order_month, + {{ date_trunc("year", "order_date") }} as order_year +from {{ ref('orders') }} +``` + +Running the above would produce the following sample results: + +| **order_id** | **order_date** | **order_week** | **order_month** | **order_year** | +|:---:|:---:|:---:|:---:|:---:| +| 1 | 2018-01-01 | 2018-01-01 | 2018-01-01 | 2018-01-01 | +| 70 | 2018-03-12 | 2018-03-12 | 2018-03-01 | 2018-01-01 | +| 91 | 2018-03-31 | 2018-03-26 | 2018-03-01 | 2018-01-01 | + +The `order_week`, `order_month`, and `order_year` fields are the truncated values from the `order_date` field. \ No newline at end of file diff --git a/website/docs/sql-reference/date-functions/sql-dateadd.md b/website/docs/sql-reference/date-functions/sql-dateadd.md new file mode 100644 index 00000000000..8081ae14100 --- /dev/null +++ b/website/docs/sql-reference/date-functions/sql-dateadd.md @@ -0,0 +1,92 @@ +--- +id: dateadd +title: SQL DATEADD +description: The DATEADD function in SQL adds a time/date interval to a date and then returns the date. This allows you to add or subtract a certain period of time from a given start date. +slug: /sql-reference/dateadd +--- + + + What is the SQL DATEADD Function? + + +If you’ve used the DATEADD SQL function any number of times, you’ve googled the syntax of the function all of those times except one, when I decided to hit the “are you feeling lucky” button and go for it. 
+ +In switching between SQL dialects (BigQuery, Postgres and Snowflake are my primaries), it's almost impossible to remember the argument order (or exact function name) of dateadd. + +This article will go over how the DATEADD function works, the nuances of using it across the major cloud warehouses, and how to standardize the syntax variances using dbt macro. + +## What is the DATEADD SQL function? + +The DATEADD function in SQL adds a time/date interval to a date and then returns the date. This allows you to add or subtract a certain period of time from a given start date. + +Sounds simple enough, but this function lets you do some pretty useful things like calculating an estimated shipment date based on the ordered date. + +## Differences in DATEADD syntax across data warehouse platforms + +All of them accept the same rough parameters, in slightly different syntax and order: + +- Start / from date +- Datepart (day, week, month, year) +- Interval (integer to increment by) + +The *functions themselves* are named slightly differently, which is common across SQL dialects. 
+ +### For example, the DATEADD function in Snowflake… + +``` +dateadd( {{ datepart }}, {{ interval }}, {{ from_date }} ) +``` + +*Hour, minute and second are supported!* + +### For example, the DATEADD function in Snowflake… + +``` +dateadd( {{ datepart }}, {{ interval }}, {{ from_date }} ) +``` + +*Hour, minute and second are supported!* + +### The DATEADD function in Databricks + +```sql +date_add( {{ startDate }}, {{ numDays }} ) +``` + +### The DATEADD function in BigQuery… + +```sql +date_add( {{ from_date }}, INTERVAL {{ interval }} {{ datepart }} ) +``` + +*Dateparts of less than a day (hour / minute / second) are not supported.* + +### The DATEADD function in Postgres… + + +Postgres doesn’t provide a dateadd function out of the box, so you’ve got to go it alone - but the syntax looks very similar to BigQuery’s function… + +```sql +{{ from_date }} + (interval '{{ interval }} {{ datepart }}') +``` + +Switching back and forth between those SQL syntaxes usually requires a quick scan through the warehouse’s docs to get back on the horse. + +## Standardizing your DATEADD SQL syntax with a dbt macro + +But couldn’t we be doing something better with those keystrokes, like typing out and then deleting a tweet? + +dbt v1.2 helps us smooth out these wrinkles of writing [SQL across data warehouses](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros). 
+ +Instead of looking up the syntax each time you use it, you can just write it the same way each time, and the macro compiles it to run on your chosen warehouse: + +``` +{{ dateadd(datepart, interval, from_date_or_timestamp) }} +``` + +Adding 1 month to a specific date would look like… + +``` +{{ dateadd(datepart="month", interval=1, from_date_or_timestamp="'2021-08-12'") }} +``` + diff --git a/website/docs/sql-reference/date-functions/sql-datediff.md b/website/docs/sql-reference/date-functions/sql-datediff.md new file mode 100644 index 00000000000..318bc5c5c12 --- /dev/null +++ b/website/docs/sql-reference/date-functions/sql-datediff.md @@ -0,0 +1,76 @@ +--- +id: datediff +title: SQL DATEDIFF +description: The DATEDIFF function will return the difference in specified units (ex. days, weeks, years) between a start date/time and an end date/time. +slug: /sql-reference/datediff +--- + + + What is the SQL DATEDIFF Function? + + +*“How long has it been since this customer last ordered with us?”*
          +*“What is the average number of days to conversion?”* + +Business users will have these questions, data people will have to answer these questions, and the only way to solve them is by calculating the time between two different dates. Luckily, there’s a handy DATEDIFF function that can do that for you. + +The DATEDIFF function will return the difference in specified units (ex. days, weeks, years) between a start date/time and an end date/time. It’s a simple and widely used function that you’ll find yourself using more often than you expect. + +DATEDIFF is a little bit like your favorite pair of socks; you’ll usually find the first one easily and feel like the day is going to be great. But for some reason, the matching sock requires a little digging in the drawer. DATEDIFF is this pair of socks—you’ll inevitably find yourself Googling the syntax almost every time you use it, but you can’t go through your day without using it. + +This page will go over how to use the DATEDIFF function across different data warehouses and how to write more standardized DATEDIFF functions using a dbt macro (or successfully find your socks as a pair in one go). + +## How to use the DATEDIFF function​ + +For the DATEDIFF function, there are three elements, or arguments, passed in: + +- The date part: This is the days/months/weeks/years (unit) of the difference calculated +- The first (start) date/time +- The second (end) date/time + +The DATEDIFF function can be used in [SELECT](/sql-reference/select) statements and WHERE clauses. + +Most, if not all, modern cloud data warehouses support some type of the DATEDIFF function. There may be some minor differences between the argument order and function name for DATEDIFF across data warehouses, but the functionality very much remains the same. + +Below, we’ll outline some of the slight differences in the implementation between some data warehouses. 
+ +## SQL DATEDIFF function syntax in Snowflake, Databricks, and Redshift + +The syntax for using the DATEDIFF function in Snowflake, Amazon Redshift, and Databricks looks like the following: + +```sql +datediff(<date part>, <start date/time>, <end date/time>) +``` + +A note on Databricks: Databricks additionally supports a separate [DATEDIFF function](https://docs.databricks.com/sql/language-manual/functions/datediff.html) that takes only two arguments: a start date and an end date. The function will always return the difference between two dates in days. + +### DATEDIFF in Google BigQuery + +The syntax for the equivalent function in [Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/datetime_functions#datetime_diff) is `datetime_diff(<end date/time>, <start date/time>, <date part>)`, which differs in a few ways: + +- Unlike in Snowflake, Amazon Redshift, and Databricks where the `<date part>` is passed as the first argument, the `<date part>` is passed in as the last argument in Google BigQuery. +- Google BigQuery also calls the function DATETIME_DIFF with an additional underscore separating the function name. This is on par with [Google BigQuery’s preference to have underscores in function names](https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions). +- The DATETIME_DIFF arguments are datetimes, not dates; Snowflake, Redshift, and Databricks’ DATEDIFF functions support multiple [date types](/sql-reference/data-types#date-data-types) such as dates and timestamps. BigQuery also supports a separate [DATE_DIFF function](https://cloud.google.com/bigquery/docs/reference/standard-sql/date_functions#date_diff) that will return the difference between two date types, unlike the DATETIME_DIFF that only supports the datetime type. + +## A hero in the shadows: The DATEDIFF dbt macro! + +You may be able to memorize the syntax for the DATEDIFF function for the primary data warehouse you use. What happens when you switch to a different one for a new job or a new data stack? 
Remembering if there’s an underscore in the function name or which argument the `` is passed in as is… no fun and leads to the inevitable, countless “datediff in bigquery” Google searches. + +Luckily, [dbt-core](https://github.com/dbt-labs/dbt-core) has your back! dbt Core is the open source dbt product that helps data folks write their [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) following software engineering best practices. + +With dbt v1.2, [adapters](https://docs.getdbt.com/docs/supported-data-platforms) now support [cross-database macros](https://docs.getdbt.com/reference/dbt-jinja-functions/cross-database-macros) to help you write certain functions, like DATE_TRUNC and DATEDIFF, without having to memorize sticky function syntax. + +Using the DATEDIFF macro, you can calculate the difference between two dates without having to worry about finicky syntax. Specifically, this means you could successfully run the same code across multiple databases without having to worry about the finicky differences in syntax. + +Using the [jaffle shop](https://github.com/dbt-labs/jaffle_shop/blob/main/models/orders.sql), a simple dataset and dbt project, we can calculate the difference between two dates using the dbt DATEDIFF macro: + +```sql +select + *, + {{ datediff("order_date", "'2022-06-09'", "day") }} +from {{ ref('orders') }} +``` + +This would return all fields from the orders table and the difference in days between order dates and June 9, 2022. + +Under the hood, this macro is taking your inputs and creating the appropriate SQL syntax for the DATEDIFF function *specific to your data warehouse*. 
diff --git a/website/docs/sql-reference/date-functions/sql-datepart.md b/website/docs/sql-reference/date-functions/sql-datepart.md new file mode 100644 index 00000000000..62743f90ee3 --- /dev/null +++ b/website/docs/sql-reference/date-functions/sql-datepart.md @@ -0,0 +1,64 @@ +--- +id: datepart +title: SQL DATE_PART +description: The DATE_PART function allows you to extract a specified date part from a date/time. Like most other SQL functions, you need to pass in arguments; for the DATE_PART function, you’ll pass in a date or timestamp field that you want to extract a date part from and specify the part you want extracted. +slug: /sql-reference/datepart +--- + + + Working with SQL DATE_PART + + +In this post, we’re going to give a deep dive into the DATE_PART function, how it works, and why we use it. + +The DATE_PART function allows you to extract a specified date part from a date/time. For example, if you were to extract the month from the date February 14, 2022, it would return 2 since February is the second month in the year. + +## How to use the DATE_PART function + +Like most other SQL functions, you need to pass in arguments; for the DATE_PART function, you’ll pass in a date or timestamp field that you want to extract a date part from and specify the part you want extracted. You can extract the numeric month, date, year, hour, seconds, etc. from a timestamp or date field using the DATE_PART function with the following syntax: + +`date_part(<date part>, <date/time field>)` + +Let’s take a look at a practical example below. 
+ +### DATE_PART function example + +```sql +select + date_part('month', order_date) as order_month, + round(avg(amount)) as avg_order_amount +from {{ ref('orders') }} +group by 1 +``` + +This query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return the rounded order amount per each order month (represented as a numeric value): + +| order_month | avg_order_amount | +|:---:|:---:| +| 1 | 17 | +| 2 | 15 | +| 3 | 18 | +| 4 | 17 | + +Unlike the DATE_TRUNC function that actually truncates a date to its first instance of a given date part (so it maintains a date structure), the DATE_PART function returns a numeric value from a date field. + +You may commonly see the DATE_PART function replaced with an EXTRACT function, which performs the same functionality. + +## DATE_PART function syntax in Snowflake, Databricks, BigQuery, and Redshift + +| Data warehouse | DATE_PART support? | Notes | +|:---:|:---:|:---:| +| Snowflake | ✅ | | +| Databricks | ✅ | | +| Amazon Redshift | ✅ | | +| Google BigQuery | ❌ | BigQuery supports the EXTRACT function which performs the same functionality as the DATE_PART function | +| Postgres | ✅ | This is overly pedantic and you’ll likely never encounter an issue with DATE_PART and EXTRACT evaluating to differences in values that truly matter, but it’s worth noting. Postgres’ DATE_PART and EXTRACT functions would previously evaluate to the same output. However, with Postgres 14, the EXTRACT function now returns a numeric type instead of an 8-byte float. 
| + +## DATE_PART function use cases + +We most commonly see the DATE_PART or EXTRACT function used in data work to analyze: + +- Fiscal calendars: If your business uses fiscal years, or calendars that differ from the normal 12-month cycle, DATE_PART functions can help create alignment between fiscal calendars and normal calendars +- Ad hoc analysis: The DATE_PART function is useful in ad hoc analyses and queries when you need to look at values grouped by date periods or for period comparisons + +This isn’t an exhaustive list of where your team may be using the DATE_PART function throughout your dbt models and BI tool logic, but it contains some common scenarios analytics engineers face day-to-day. diff --git a/website/docs/sql-reference/joins/sql-cross-join.md b/website/docs/sql-reference/joins/sql-cross-join.md new file mode 100644 index 00000000000..5900ae93256 --- /dev/null +++ b/website/docs/sql-reference/joins/sql-cross-join.md @@ -0,0 +1,76 @@ +--- +id: cross-join +title: SQL CROSS JOIN +description: A cross join typically takes two columns between two database objects and creates a table forming a combination of all rows across joined tables, called a cartesian product. +slug: /sql-reference/cross-join +--- + + + Working with cross joins in SQL + + +A rarely seen, but important, join: the cross join. The majority of your analytics engineering work will require you to join tables together to create robust, wide tables that will eventually be exposed to end business users. These models will usually be created using mostly [left](/sql-reference/left-join) (and some [inner](/sql-reference/inner-join)) joins. + +A cross join, on the other hand, typically takes two columns between two database objects and creates a table forming a combination of all rows across joined tables, called a cartesian product. Use this page to understand how to use cross joins and where you might leverage them in your dbt project. 
+ +## How to create a cross join + +Unlike regular joins, cross joins don’t use keys to join database objects together: + +``` +select + <columns> +from <table_1> as t1 +cross join <table_2> as t2 +``` + +Cross joins are one of those SQL concepts that is easier to understand with a tangible example, so let’s jump into it. + +### SQL cross join example + +Table A `date_spine` + +| date | +|:---:| +| 2022-01-01 | +| 2022-01-02 | +| 2022-01-03 | + +Table B `users` + +| user_id | +|:---:| +| 1 | +| 3 | +| 4 | + +```sql +select + users.user_id as user_id, + date.date as date +from {{ ref('users') }} as users +cross join {{ ref('date_spine') }} as date +order by 1 +``` + +This simple query will return a cartesian cross of all users and dates, essentially creating a unique combination of user per date per row: + +| user_id | date | +|:---:|:---:| +| 1 | 2022-01-01 | +| 1 | 2022-01-02 | +| 1 | 2022-01-03 | +| 3 | 2022-01-01 | +| 3 | 2022-01-02 | +| 3 | 2022-01-03 | +| 4 | 2022-01-01 | +| 4 | 2022-01-02 | +| 4 | 2022-01-03 | + +:::tip Generate surrogate keys from cross joins +In the generated table above, the unique key is a combination of the `user_id` and `date` per row. To add a primary key to this table, you could generate a surrogate key using an MD5 hash with the `generate_surrogate_key` macro in dbt-utils (ex. `{{ dbt_utils.generate_surrogate_key(['user_id', 'date']) }}`) that could eventually be joined onto other tables. +::: + +## SQL cross join use case + +When would the generated table above be useful? Cross joining unique dates and users can be an effective way to create a base table to join various event counts, such as key website, email, or product events, to. These report-type tables are useful to expose to end business users in BI tools to look at aggregate counts per day per user and other useful measures. 
\ No newline at end of file diff --git a/website/docs/sql-reference/joins/sql-full-outer-join.md b/website/docs/sql-reference/joins/sql-full-outer-join.md new file mode 100644 index 00000000000..8aa8d7f4faa --- /dev/null +++ b/website/docs/sql-reference/joins/sql-full-outer-join.md @@ -0,0 +1,74 @@ +--- +id: outer-join +title: SQL OUTER JOIN +description: A SQL full outer join is a join between two tables that returns all rows from both tables, regardless of join key match success. +slug: /sql-reference/outer-join +--- + + + Working with full outer joins in SQL + + +SQL full outer joins exist and therefore we have to talk about them, but they’re *highly unlikely* to be a join you regularly leverage in your data work. In plain terms, a SQL full outer join is a join between two tables that returns *all rows* from both tables, regardless of join key match success; compare this to [left](/sql-reference/left-join), [inner](/sql-reference/inner-join), or [right joins](/sql-reference/right-join) that require matches to be successful to return certain rows. + +On this page, we’ll unpack how to create a full outer join and demonstrate when you might need one in your analytics engineering work. + +## How to create a full outer join + +Like all joins, you need some database objects (i.e., tables/views), keys to join on, and a [select statement](/sql-reference/select) to perform a full outer join: + +``` +select + <columns> +from <table_1> as t1 +full outer join <table_2> as t2 +on t1.id = t2.id +``` + +In this example above, there’s only one field being used to join the table together; if you’re joining between database objects that require multiple fields, you can leverage AND/OR operators, and more preferably, surrogate keys. You may additionally add [WHERE](/sql-reference/where), [GROUP BY](/sql-reference/group-by), [ORDER BY](/sql-reference/order-by), [HAVING](/sql-reference/having), and other clauses after your joins to filter, order, and perform aggregations. 
+ +A note on full outer joins: it may sound obvious, but because full outer joins can return all rows between two tables, they therefore can return *many* rows, which is not necessarily a recipe for efficiency. When you use full outer joins, you often can find alternatives using different joins or unions to potentially bypass major inefficiencies caused by a full outer join. + +### SQL full outer join example + +Table A `car_type` + +| user_id | car_type | +|:---:|:---:| +| 1 | van | +| 2 | sedan | +| 3 | truck | + +Table B `car_color` + +| user_id | car_color | +|:---:|:---:| +| 1 | red | +| 3 | green | +| 4 | yellow | + +```sql +select + coalesce(car_type.user_id, car_color.user_id) as user_id, + car_type.car_type as type, + car_color.car_color as color +from {{ ref('car_type') }} as car_type +full outer join {{ ref('car_color') }} as car_color +on car_type.user_id = car_color.user_id +order by 1 +``` + +This simple query will return all rows from tables A and B, regardless of `user_id` match success between the two tables: + +| user_id | type | color | +|:---:|:---:|:---:| +| 1 | van | red | +| 2 | sedan | null | +| 3 | truck | green | +| 4 | null | yellow | + +## SQL full outer join use cases + +There will inevitably be valid use cases for full outer joins in your dbt project. However, because of the nature of dbt, which heavily encourages modularity and dryness, the necessity for full outer joins may go down (slightly). Regardless, the two primary cases for full outer joins we typically see are around consolidating or merging multiple entities together and data validation. +- Merging tables together: A full outer join between two tables can bring those entities together, regardless of join key match. 
This type of joining can often be bypassed by using different joins, unions, pivots, and a combination of these, but hey, sometimes the full outer join is a little less work 🤷 +- Data validation: Full outer joins can be incredibly useful when performing data validation; for example, in the [dbt-audit-helper package](https://github.com/dbt-labs/dbt-audit-helper), a full outer join is used in the [compare_column_values test](https://github.com/dbt-labs/dbt-audit-helper/blob/main/macros/compare_column_values.sql) to help determine where column values are mismatched between two dbt models. \ No newline at end of file diff --git a/website/docs/sql-reference/joins/sql-inner-join.md b/website/docs/sql-reference/joins/sql-inner-join.md new file mode 100644 index 00000000000..0cf8a3894bd --- /dev/null +++ b/website/docs/sql-reference/joins/sql-inner-join.md @@ -0,0 +1,70 @@ +--- +id: inner-join +title: SQL INNER JOINS +description: An inner join between two database objects returns all rows that have matching join keys; any keys that don’t match are omitted from the query result. +slug: /sql-reference/inner-join +--- + + + Working with inner joins in SQL + + +The cleanest and easiest of SQL joins: the humble inner join. Just as its name suggests, an inner join between two database objects returns all rows that have matching join keys; any keys that don’t match are omitted from the query result. + +## How to create an inner join + +Like all joins, you need some database objects (ie tables/views), keys to join on, and a [select statement](/sql-reference/select) to perform an inner join: + +``` +select + +from as t1 +inner join as t2 +on t1.id = t2.id +``` + +In this example above, there’s only one field from each table being used to join the two together; if you’re joining between two database objects that require multiple fields, you can leverage AND/OR operators, and more preferably, surrogate keys. 
You may additionally add [WHERE](/sql-reference/where), [GROUP BY](/sql-reference/group-by), [ORDER BY](/sql-reference/order-by), [HAVING](/sql-reference/having), and other clauses after your joins to create filtering, ordering, and performing aggregations. + +As with any query, you can perform as many joins as you want in a singular query. A general word of advice: try to keep data models modular by performing regular audits. If you join certain tables further upstream, are those individual tables needed again further downstream? If your query involves multiple joins and complex logic and is exposed to end business users, ensure that you leverage table or [incremental materializations](https://docs.getdbt.com/docs/build/incremental-models). + +### SQL inner join example + +Table A `car_type` + +| user_id | car_type | +|:---:|:---:| +| 1 | van | +| 2 | sedan | +| 3 | truck | + +Table B `car_color` + +| user_id | car_color | +|:---:|:---:| +| 1 | red | +| 3 | green | +| 4 | yellow | + +```sql +select + car_type.user_id as user_id, + car_type.car_type as type, + car_color.car_color as color +from {{ ref('car_type') }} as car_type +inner join {{ ref('car_color') }} as car_color +on car_type.user_id = car_color.user_id +``` + +This simple query will return all rows that have the same `user_id` in both Table A and Table B: + +| user_id | type | color | +|:---:|:---:|:---:| +| 1 | van | red | +| 3 | truck | green | + +Because there’s no `user_id` = 4 in Table A and no `user_id` = 2 in Table B, rows with ids 2 and 4 (from either table) are omitted from the inner join query results. 
+ +## SQL inner join use cases + +There are probably countless scenarios where you’d want to inner join multiple tables together—perhaps you have some really nicely structured tables with the exact same primary keys that should really just be one larger, wider table or you’re joining two tables together and don’t want the null or missing column values you’d get if you used a left or right join—it’s all pretty dependent on your source data and end use cases. Where you will not (and should not) see inner joins is in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) that are used to clean and prep raw source data for analytics uses. Any joins in your dbt projects should happen further downstream in [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) and [mart models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) to improve modularity and DAG cleanliness. + diff --git a/website/docs/sql-reference/joins/sql-left-join.md b/website/docs/sql-reference/joins/sql-left-join.md new file mode 100644 index 00000000000..841edc41cdd --- /dev/null +++ b/website/docs/sql-reference/joins/sql-left-join.md @@ -0,0 +1,76 @@ +--- +id: left-join +title: SQL LEFT JOIN +description: The left join returns all rows in the FROM statement, regardless of match in the left join database object. +slug: /sql-reference/left-join +--- + + + Working with left joins in SQL + + +An analytics engineer favorite: the left join. Without a doubt, this is probably the most regularly used join in any dbt project (and for good reason). + +The left join returns all rows in the [FROM statement](/sql-reference/from), regardless of match in the left join database object. Compare this to an [inner join](/sql-reference/inner-join), where only rows are returned that have successful key matches between the database object in the FROM statement and in the inner join statement. 
+ +## How to create a left join + +Like all joins, you need some database objects (i.e., tables/views), keys to join on, and a [select statement](/sql-reference/select) to perform a left join: + +``` +select + <columns> +from <table_1> as t1 +left join <table_2> as t2 +on t1.id = t2.id +``` + +In this example above, there’s only one field from each table being used to join the two together; if you’re joining between two database objects that require multiple fields, you can leverage AND/OR operators, and more preferably, surrogate keys. You may additionally add [WHERE](/sql-reference/where), [GROUP BY](/sql-reference/group-by), [ORDER BY](/sql-reference/order-by), [HAVING](/sql-reference/having), and other clauses after your joins to filter, order, and perform aggregations. You may also perform as many left joins (or any joins, really) as you’d like in an individual query or CTE. + +### SQL left join example + +Table A `car_type` + +| **user_id** | **car_type** | +|:---:|:---:| +| 1 | van | +| 2 | sedan | +| 3 | truck | + +Table B `car_color` + +| user_id | car_color | +|:---:|:---:| +| 1 | red | +| 3 | green | +| 4 | yellow | + +```sql +select + car_type.user_id as user_id, + car_type.car_type as type, + car_color.car_color as color +from {{ ref('car_type') }} as car_type +left join {{ ref('car_color') }} as car_color +on car_type.user_id = car_color.user_id +``` + +This simple query will return *all rows* from Table A and add the `color` column to rows where there’s a successful match to Table B: + +| **user_id** | **type** | **color** | +|:---:|:---:|:---:| +| 1 | van | red | +| 2 | sedan | null | +| 3 | truck | green | + +Because there’s no `user_id` = 2 in Table B, there is no `color` available, thus a null result in the `color` column for `user_id` 2. 
+ +## SQL left join use cases + +Left joins are fundamental in data modeling and analytics engineering work—they allow you to easily join database objects onto each other while maintaining an original table’s row count (in the from statement). Compared to right joins, which return all rows in a right join database object (and not the from statement), we find left joins a little more intuitive to understand and build off of. + +:::tip Ensure your joins are just ~~left~~ right +Something to note if you use left joins: if there are multiple records for an individual key in the left join database object, be aware that duplicates can potentially be introduced in the final query result. This is where dbt tests, such as testing for uniqueness and [equal row count](https://github.com/dbt-labs/dbt-utils#equal_rowcount-source) across upstream source tables and downstream child models, can help you identify faulty data modeling logic and improve data quality. +::: + +Where you will not (and should not) see left joins is in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) that are used to clean and prep raw source data for analytics uses. Any joins in your dbt projects should happen further downstream in [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) and [mart models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts) to improve modularity and DAG cleanliness. \ No newline at end of file diff --git a/website/docs/sql-reference/joins/sql-right-join.md b/website/docs/sql-reference/joins/sql-right-join.md new file mode 100644 index 00000000000..d4d377d1efb --- /dev/null +++ b/website/docs/sql-reference/joins/sql-right-join.md @@ -0,0 +1,72 @@ +--- +id: right-join +title: SQL RIGHT JOIN +description: Right joins return all rows in the right join database object regardless of match in the database object in the FROM statement.
+slug: /sql-reference/right-join +--- + + + Working with right joins in SQL + + +The not-as-favorite child: the right join. Unlike [left joins](/sql-reference/left-join) that return all rows in the database object in [the FROM statement](/sql-reference/from), regardless of match in the left join object, right joins return all rows *in the right join database object*, regardless of match in the database object in the FROM statement. + +What you really need to know: You can accomplish anything a right join does with a left join and left joins typically are more readable and intuitive. However, we’ll still walk you through how to use right joins and elaborate on why we think left joins are superior 😉 + +## How to create a right join + +Like all joins, you need some database objects (ie tables/views), keys to join on, and a [select statement](/sql-reference/select) to perform a right join: + +``` +select + +from as t1 +right join as t2 +on t1.id = t2.id +``` + +In this example above, there’s only one field from each table being used to join the two together together; if you’re joining between two database objects that require multiple fields, you can leverage AND/OR operators, and more preferably, surrogate keys. You may additionally add [WHERE](/sql-reference/where), [GROUP BY](/sql-reference/group-by), [ORDER BY](/sql-reference/order-by), [HAVING](/sql-reference/having), and other clauses after your joins to create filtering, ordering, and performing aggregations. You may also right (or any join really) as many joins as you’d like in an individual query or . 
+ +### SQL right join example + +Table A `car_type` + +| **user_id** | **car_type** | +|:---:|:---:| +| 1 | van | +| 2 | sedan | +| 3 | truck | + +Table B `car_color` + +| **user_id** | **car_color** | +|:---:|:---:| +| 1 | red | +| 3 | green | +| 4 | yellow | + +```sql +select + car_type.user_id as user_id, + car_type.car_type as type, + car_color.car_color as color +from {{ ref('car_type') }} as car_type +right join {{ ref('car_color') }} as car_color +on car_type.user_id = car_color.user_id +``` + +This simple query will return *all* rows from Table B and adds the `color` column to rows where there’s a successful match to Table A: + +| **user_id** | **type** | **color** | +|:---:|:---:|:---:| +| 1 | van | red | +| 3 | truck | green | +| 4 | null | yellow | + +Because there’s no `user_id` = 4 in Table A, there is no `type` available, thus a null result `type` column for `user_id` 4. Since no `user_id` = 2 exists in Table B, and that id is not in the right join database object, no rows with a `user_id` of 2 will be returned. + +## SQL right join use cases + +Compared to left joins, you likely won’t see right joins as often (or ever) in data modeling and analytics engineering work. But why not? + +Simply because right joins are a little less intuitive than a left join. When you’re data modeling, you’re usually focused on one database object, and adding the supplementary data or tables you need to give you a final dataset. That one focal database object is typically what is put in the `from {{ ref('my_database_object')}}`; any other columns that are joined onto it from other tables are usually supplementary, but keeping all the rows from the initial table of focus is usually the priority. Don’t get us wrong—right joins can get you there—it’s likely just a little less intuitive and can get complex with queries that involve multiple joins. 
\ No newline at end of file diff --git a/website/docs/sql-reference/joins/sql-self-join.md b/website/docs/sql-reference/joins/sql-self-join.md new file mode 100644 index 00000000000..0eef0fcab7c --- /dev/null +++ b/website/docs/sql-reference/joins/sql-self-join.md @@ -0,0 +1,71 @@ +--- +id: self-join +title: SQL SELF JOINS +description: A self join allows you to join a dataset back onto itself. A common use case for a self join is when a table contains a foreign key to the primary key of that same table. +slug: /sql-reference/self-join +--- + + + Working with self joins in SQL + + +Simultaneously the easiest and most confusing of joins: the self join. Simply put, a self join allows you to join a dataset back onto itself. + +If you’re newer to data work or SQL, you may be asking yourself: why in the world would you want to do this? Shouldn’t joins happen between multiple *different* entities? + +The majority of joins you see in analytics work and dbt projects will probably be left and inner joins, but occasionally, depending on how the raw source table is built out, you’ll leverage a self join. One of the most common use cases for a self join is when a table contains a foreign key to the primary key of that same table. + +It’s ok if none of that made sense—jump into this page to better understand how and where you might use a self join in your analytics engineering work. + +## How to create a self join + +No funny venn diagrams here—there isn’t even any special syntax for self joins.
To create a self join, you’ll use regular join syntax; the only difference is that the join objects are *the same*: + +``` +select + <columns> +from <table> as t1 +[<join_type>] join <table> as t2 +on t1.id = t2.id +``` + +Since you can choose the type of join for a self join, you can specify if you want to do a [left](/sql-reference/left-join), [outer](/sql-reference/outer-join), [inner](/sql-reference/inner-join), [cross](/sql-reference/cross-join), or [right join](/sql-reference/right-join) in the join statement. + +### SQL self join example + +Given a `products` table that looks like this, where there exists both a primary key (`sku_id`) and foreign key (`parent_id`) to that primary key: + +| **sku_id** | **sku_name** | **parent_id** | +|:---:|:---:|:---:| +| 1 | Lilieth Bed | 4 | +| 2 | Holloway Desk | 3 | +| 3 | Basic Desk | null | +| 4 | Basic Bed | null | + +And this query utilizing a self join to join `parent_name` onto skus: + +```sql +select + products.sku_id, + products.sku_name, + products.parent_id, + parents.sku_name as parent_name +from {{ ref('products') }} as products +left join {{ ref('products') }} as parents +on products.parent_id = parents.sku_id +``` + +This query utilizing a self join adds the `parent_name` of skus that have non-null `parent_ids`: + +| sku_id | sku_name | parent_id | parent_name | +|:---:|:---:|:---:|:---:| +| 1 | Lilieth Bed | 4 | Basic Bed | +| 2 | Holloway Desk | 3 | Basic Desk | +| 3 | Basic Desk | null | null | +| 4 | Basic Bed | null | null | + +## SQL self join use cases + +Again, self joins are probably rare in your dbt project and will most often be utilized in tables that contain a hierarchical structure, such as a table with a column that is a foreign key to the primary key of the same table.
If you do have use cases for self joins, such as in the example above, you’ll typically want to perform that self join early upstream in your DAG, such as in a [staging](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) or [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) model; if your raw, unjoined table is going to need to be accessed further downstream sans self join, that self join should happen in a modular intermediate model. + +You can also use self joins to create a cartesian product (aka a cross join) of a table against itself. Again, slim use cases, but still there for you if you need it 😉 \ No newline at end of file diff --git a/website/docs/sql-reference/operators/sql-and.md b/website/docs/sql-reference/operators/sql-and.md new file mode 100644 index 00000000000..cceeb4b2300 --- /dev/null +++ b/website/docs/sql-reference/operators/sql-and.md @@ -0,0 +1,59 @@ +--- +id: and +title: SQL AND +description: The AND operator returns results that meet all requirements passed into it. You’ll often see the AND operator used in a WHERE clause to filter query results. +slug: /sql-reference/and +--- + + + Working with the SQL AND operator + + +The AND operator returns results that meet all requirements passed into it, compared to the [OR operator](/sql-reference/or), which only needs one requirement to be true. You’ll often see the AND operator used in a [WHERE clause](/sql-reference/where) to filter query results or in a case statement to create multiple criteria for a result. + +Use this page to understand how to use the AND operator and why it might be helpful in analytics engineering work. + +## How to use the AND operator + +It’s straightforward to use the AND operator, and you’ll typically see it appear in a WHERE clause to filter query results appropriately, in case statements, or joins that involve multiple fields.
+ +```sql +-- and in a where clause +where and and… + +-- and in a case statement +case when and then … + +-- and in a join +from +join on + = and = +``` + +:::tip Surrogate keys > joins with AND +Using surrogate keys, hashed values of multiple columns, is a great way to avoid using AND operators in joins. Typically, having AND or [OR operators](/sql-reference/or) in a join can cause the query or model to be potentially inefficient, especially at considerable data volume, so creating surrogate keys earlier in your upstream tables ([using the surrogate key macro](https://docs.getdbt.com/blog/sql-surrogate-keys)) can potentially improve performance in downstream models. +::: + +### SQL AND operator example + +```sql +select + order_id, + status, + round(amount) as amount +from {{ ref('orders') }} +where status = 'shipped' and amount > 20 +limit 3 +``` + +This query using the sample dataset Jaffle Shop’s `orders` table will return results where the order status is shipped and the order amount is greater than $20: + +| **order_id** | **status** | **amount** | +|:---:|:---:|:---:| +| 74 | shipped | 30 | +| 88 | shipped | 29 | +| 78 | shipped | 26 | + +## AND operator syntax in Snowflake, Databricks, BigQuery, and Redshift + +Snowflake, Databricks, Google BigQuery, and Amazon Redshift all support the AND operator with the same syntax for it across each platform. \ No newline at end of file diff --git a/website/docs/sql-reference/operators/sql-any-all.md b/website/docs/sql-reference/operators/sql-any-all.md new file mode 100644 index 00000000000..250bb9b6ace --- /dev/null +++ b/website/docs/sql-reference/operators/sql-any-all.md @@ -0,0 +1,59 @@ +--- +id: any-all +title: SQL ANY and ALL +description: The ANY operator will return true if any of the conditions passed into evaluate to true, while ALL will only return true if all conditions passed into it are true. 
+slug: /sql-reference/any-all +--- + + + Working with the SQL ANY and ALL operators + + +The SQL ANY and ALL operators are useful for evaluating conditions to limit query results; they are often passed in with [LIKE](/sql-reference/like) and [ILIKE](/sql-reference/ilike) operators. The ANY operator will return true if any of the conditions passed into it evaluate to true, while ALL will only return true if *all* conditions passed into it are true. + +Use this page to better understand how to use ANY and ALL operators, use cases for these operators, and which data warehouses support them. + +## How to use the SQL ANY and ALL operators + +The ANY and ALL operators have very simple syntax and are often used with the LIKE/ILIKE operators or a subquery: + +`where <field_name> like/ilike any/all (array_of_options)` + +`where <field_name> = any/all (subquery)` + +Some notes on this operator’s syntax and functionality: +- You may pass a subquery into the ANY or ALL operator instead of an array of options +- Use the ILIKE operator with ANY or ALL to avoid case sensitivity + +Let’s dive into a practical example using the ANY operator now. + +### SQL ANY example + +```sql +select + order_id, + status +from {{ ref('orders') }} +where status like any ('return%', 'ship%') +``` + +This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return orders whose status is like the patterns `start with 'return'` or `start with 'ship'`: + +| order_id | status | +|:---:|:---:| +| 18 | returned | +| 23 | return_pending | +| 74 | shipped | + +Because LIKE is case-sensitive, it would not return results in this query for orders whose status was, say, `RETURNED` or `SHIPPED`. If you have a mix of uppercase and lowercase strings in your data, consider standardizing casing for strings using the [UPPER](/sql-reference/upper) and [LOWER](/sql-reference/lower) functions or use the more flexible ILIKE operator.
+ +## ANY and ALL syntax in Snowflake, Databricks, BigQuery, and Redshift + +Snowflake and Databricks support the ability to use ANY in a LIKE operator. Amazon Redshift and Google BigQuery, however, do not support the use of ANY in a LIKE or ILIKE operator. Use the table below to read more on the documentation for the ANY operator in your data warehouse. + +| **Data warehouse** | **ANY support?** | **ALL support?** | +|:---:|:---:|:---:| +| [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/like_any.html) | ✅ | ✅ | +| [Databricks](https://docs.databricks.com/sql/language-manual/functions/like.html) | ✅ | ✅ | +| Amazon Redshift | ❌Not supported; consider utilizing multiple OR clauses or [IN operators](/sql-reference/in). | ❌Not supported; consider utilizing multiple [AND clauses](/sql-reference/and) | +| Google BigQuery | ❌Not supported; consider utilizing [multiple OR clauses](https://stackoverflow.com/questions/54645666/how-to-implement-like-any-in-bigquery-standard-sql) or IN operators. | ❌Not supported; consider utilizing multiple AND clauses | \ No newline at end of file diff --git a/website/docs/sql-reference/operators/sql-between.md b/website/docs/sql-reference/operators/sql-between.md new file mode 100644 index 00000000000..a67b397802a --- /dev/null +++ b/website/docs/sql-reference/operators/sql-between.md @@ -0,0 +1,77 @@ +--- +id: between +title: SQL BETWEEN +description: The SQL BETWEEN condition allows you to specify a range of numerical, date-type, or text values to filter rows on in a query. +slug: /sql-reference/between +--- + + + Working with the SQL BETWEEN operator + + +The SQL BETWEEN condition allows you to specify a range of numerical, date-type, or text values to filter rows on in a query. It’s particularly useful during ad hoc analysis work to narrow query results on a specific data range. + +In this page, we’ll dive into how to use the SQL BETWEEN condition and elaborate on why it might be useful to you. 
+ +## How to use the SQL BETWEEN condition + +The BETWEEN condition has a simple syntax and should be passed in a WHERE clause: + +`where <field_name> between <beginning_value> and <end_value>` + +It’s important to note that the BETWEEN condition is inclusive of `beginning_value` and `end_value`. + +Let’s take a look at a practical example using the BETWEEN condition below. + +### SQL BETWEEN example + +```sql +select + customer_id, + order_id, + order_date +from {{ ref('orders') }} +where order_date between '2018-01-01' and '2018-01-31' +``` + +This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return all rows where the `order_date` falls during January 2018: + +| **customer_id** | **order_id** | **order_date** | +|:---:|:---:|:---:| +| 1 | 1 | 2018-01-01 | +| 3 | 2 | 2018-01-02 | +| 94 | 3 | 2018-01-04 | +| 50 | 4 | 2018-01-05 | +| 64 | 5 | 2018-01-05 | +| 54 | 6 | 2018-01-07 | + +Alternatively, you could build this same query using comparison operators such as `>=`, `<=`, and `<` (`where order_date >= '2018-01-01' and order_date <= '2018-01-31'` or `where order_date >= '2018-01-01' and order_date < '2018-02-01'`). + +You may additionally see the NOT clause used in front of BETWEEN to exclude rows that fall between specified ranges. + +## BETWEEN syntax in Snowflake, Databricks, BigQuery, and Redshift + +Most, if not all, modern data warehouses support the BETWEEN condition; the syntax is also the same across them. If your data warehouse does not support the BETWEEN condition, consider using comparison operators (`>=` and `<=`) similar to the example outlined above. + +Use the table below to read more on the documentation for the BETWEEN operator in your data warehouse.
+ +| **Data warehouse** | **BETWEEN support?** | +|:---:|:---:| +| Snowflake | ✅ | +| Databricks | ✅ | +| Amazon Redshift | ✅ | +| Google BigQuery | ✅ | + +## SQL BETWEEN condition use cases + +You’ll most commonly see the BETWEEN condition used in data work to: +- Filter query results to be in a specified date range +- Create buckets for data using case statements, common for bucketing web session engagement or NPS score classification + +```sql +case when time_engaged between 0 and 9 then 'low_engagement' + when time_engaged between 10 and 29 then 'medium_engagement' + else 'high_engagement' end as engagement +``` + +This isn’t an extensive list of where your team may be using the BETWEEN condition throughout your dbt models or ad hoc analyses, but contains some common scenarios analytics engineers may encounter. \ No newline at end of file diff --git a/website/docs/sql-reference/operators/sql-ilike.md b/website/docs/sql-reference/operators/sql-ilike.md new file mode 100644 index 00000000000..299bfb1c90c --- /dev/null +++ b/website/docs/sql-reference/operators/sql-ilike.md @@ -0,0 +1,65 @@ +--- +id: ilike +title: SQL ILIKE +description: The ILIKE operator helps you easily match, find, and filter out case-insensitive string values of a specified pattern by using SQL wildcards. +slug: /sql-reference/ilike +--- + + + Working with the SQL ILIKE operator + + +The favorite child ILIKE helps you easily match, find, and filter out string values of a specified pattern by using SQL wildcards *without having to worry about case sensitivity*. 
If you’re a stickler for case-sensitivity, don’t hesitate to use the not-as-special (but still important) child, the LIKE operator 😆 + +## How to use the SQL ILIKE operator + +The ILIKE operator has a simple syntax, with the ability to have it utilized in WHERE clauses or case statements: + +`where <field_name> ilike '<pattern>'` or `case when <field_name> ilike '<pattern>'` + +Some notes on this operator’s syntax and functionality: +- The `<pattern>` can use two SQL wildcards (`%` and `_`); the underscore will match any single character and the % matches zero or more characters + - Ex. '%j' = any string that ends with the letter j + - Ex. 'j%' = any string that starts with the letter j + - Ex. 'j%l' = any string that starts with the letter j and ends with the letter l + - Ex. '_j%' = any string that has the letter j in the second position +- The majority of use cases for the ILIKE operator will likely involve the `%` wildcard +- The ILIKE operator is case-insensitive, meaning that the casing in the `<pattern>` you want to filter on does not need to match the casing of your column values +- The ILIKE operator can be paired with the NOT operator, to filter on rows that are not like a specified pattern + +Let’s dive into a practical example using the ILIKE operator now. + +### SQL ILIKE example + +```sql +select + payment_id, + order_id, + payment_method, + case when payment_method ilike '%card' then 'card_payment' else 'non_card_payment' end as was_card +from {{ ref('payments') }} +``` + +This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `payments` table is creating a new column to determine if a payment used a type of card (ex.
debit card, credit card, gift card) payment based on if the `payment_method` value ends in `card`: + +| **payment_id** | **order_id** | **payment_method** | **was_card** | +|:---:|:---:|:---:|:---:| +| 1 | 1 | credit_card | card_payment | +| 9 | 9 | gift_card | card_payment | +| 3 | 3 | coupon | non_card_payment | +| 4 | 4 | coupon | non_card_payment | + +## ILIKE syntax in Snowflake, Databricks, BigQuery, and Redshift + +Most modern data warehouses, with the exception of Google BigQuery, support the ILIKE operator and the syntax is the same across them. Use the table below to read more on the documentation for the ILIKE operator in your data warehouse. + +| **Data warehouse** | **ILIKE support?** | +|:---:|:---:| +| Snowflake | ✅ | +| Databricks | ✅ | +| Amazon Redshift | ✅ | +| Google BigQuery | ❌, recommend using regular expressions or the CONTAINS function | + +## ILIKE operator example use cases + +The ILIKE operator has very similar use cases to the [LIKE operator](/sql-reference/like), so we won’t repeat ourselves here. The important thing to understand when using the LIKE or ILIKE operators is what the casing variations look like in your data: if casing is inconsistent within a column, ILIKE will be your friend; if your backend engineers and analytics engineers rigorously follow a style-guide (and our source data is magically of the same case), the LIKE operator is there for you if you need it. \ No newline at end of file diff --git a/website/docs/sql-reference/operators/sql-in.md b/website/docs/sql-reference/operators/sql-in.md new file mode 100644 index 00000000000..95321a4d6b6 --- /dev/null +++ b/website/docs/sql-reference/operators/sql-in.md @@ -0,0 +1,50 @@ +--- +id: in +title: SQL IN +description: Read this guide to learn about the SQL IN operator in dbt. 
+slug: /sql-reference/in +--- + + + Working with the SQL IN Operator + + +It happens to the best of data people: The `orders` table always needs to filter out `status = 'employee_order'` in order to get accurate order counts. So your data model for the `orders` table looks a little something like this: + +```sql +select * from {{ source('backend_db', 'orders') }} +where status != 'employee_order' +``` + +What happens one day if there’s an additional `status` that needs to be filtered out? Well, that’s where the handy IN operator comes into play. + +The IN operator ultimately allows you to specify multiple values in a WHERE clause, so you can easily filter your query on multiple options. Using the IN operator is a more refined version of using multiple OR conditions in a WHERE clause. + +## How to use SQL IN operator + +In the scenario above, if you now needed to filter on an additional new `status` value to remove certain rows, your use of the IN operator would look like this: + +```sql +select * from {{ source('backend_db', 'orders') }} +where status not in ('employee_order', 'influencer_order') --list of order statuses to filter out +``` + +Woah woah woah, what is a `not in`? This is exactly what it sounds like: return all rows where the status is not `employee_order` or `influencer_order`. If you wanted to just use the IN operator, you could specify all other statuses that are appropriate (ex. `where status in ('regular_order', 'temp_order')`). + +You can additionally use the IN/NOT IN operator for a subquery, to remove/include rows from a subquery’s result: + +```sql +where status in (select …) +``` + +:::tip Compare columns against appropriate data types +The only “gotcha” that really exists in using the IN operator is remembering that the values in your IN list **must** match the data type of the column they’re compared against. This is especially important for boolean columns that could be accidentally cast as strings.
+::: + +## IN operator syntax in Snowflake, Databricks, BigQuery, and Redshift + +The IN operator, like most of the SQL operators, is not syntactically different across data warehouses. That means the syntax for using the IN/NOT IN operator is the same in Snowflake, Databricks, Google BigQuery, and Amazon Redshift. + +## IN operator use cases + +Use the IN condition to filter out inappropriate or inaccurate rows from a query or database schema object based on parameters you define and understand. We guarantee there’s an IN somewhere in your dbt project 😀 diff --git a/website/docs/sql-reference/operators/sql-like.md b/website/docs/sql-reference/operators/sql-like.md new file mode 100644 index 00000000000..3c7d2949ded --- /dev/null +++ b/website/docs/sql-reference/operators/sql-like.md @@ -0,0 +1,71 @@ +--- +id: like +title: SQL LIKE +description: The LIKE operator helps you easily match, find, and filter out case-sensitive string values of a specified pattern by using SQL wildcards. +slug: /sql-reference/like +--- + + + Working with the SQL LIKE operator + + +The LIKE operator helps you easily match, find, and filter out string values of a specified pattern by using SQL wildcards. It’s important to note that the pattern passed into the LIKE operator is case-sensitive, unlike its case-insensitive cousin, [ILIKE](/sql-reference/ilike). + +## How to use the SQL LIKE operator + +The LIKE operator has a simple syntax, with the ability to have it utilized in [WHERE clauses](/sql-reference/where) or case statements: + +`where <field_name> like '<pattern>'` or `case when <field_name> like '<pattern>'` + +Some notes on this operator’s syntax and functionality: +- The `<pattern>` can use two SQL wildcards (`%` and `_`); the underscore will match any *single character* and the % matches zero or more characters + - Ex. '%J' = any string that ends with a capital J + - Ex. 'J%' = any string that starts with a capital J + - Ex. 'J%L' = any string that starts with a capital J and ends with a capital L + - Ex.
'_J%' = any string that has a capital J in the second position +- The majority of use cases for the LIKE operator will likely involve the `%` wildcard +- The LIKE operator is case-sensitive, meaning that the casing in the `<pattern>` you want to filter for should match the casing of your column values; for columns with varied casing, leverage the case-insensitive ILIKE operator +- The LIKE operator can be paired with the NOT operator, to filter on rows that are not like a specified pattern + +Let’s dive into a practical example using the LIKE operator now. + +### SQL LIKE example + +```sql +select + customer_id, + first_name +from {{ ref('customers') }} +where first_name like 'J%' +order by 1 +``` + +This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `customers` table would return all of the customers whose first name starts with the *uppercase* letter J: + +| **customer_id** | **first_name** | +|:---:|:---:| +| 1 | Julia | +| 4 | Jeremy | + +Because LIKE is case-sensitive, it would not return results in this query for customers with lowercase J-names. If you have a mix of uppercase and lowercase strings in your data, consider standardizing casing for strings using the [UPPER](/sql-reference/upper) and [LOWER](/sql-reference/lower) functions or use the more flexible [ILIKE operator](/sql-reference/ilike). + +## LIKE syntax in Snowflake, Databricks, BigQuery, and Redshift + +Most, if not all, modern data warehouses support the LIKE operator and the syntax is also the same across them. Some data warehouses, such as Snowflake and Databricks, additionally support similar or more flexible operators such as ILIKE, the case-insensitive version of LIKE, or LIKE ANY, which allows you to pass in multiple pattern options to scan for. + +Use the table below to read more on the documentation for the LIKE operator in your data warehouse.
+ +| **Data warehouse** | **LIKE support?** | +|:---:|:---:| +| Snowflake | ✅ | +| Databricks | ✅ | +| Amazon Redshift | ✅ | +| Google BigQuery | ✅ | + +## LIKE operator example use cases + +You may see the LIKE operator used in analytics engineering work to: +- Bucket column values together based on general requirements using case statements and the LIKE operator (ex. `case when page_path like '/product%' then 'product_page' else 'non_product_page' end`) +- Filter out employee email records based on a similar email address pattern (ex. `where email_address not like '%@dbtlabs.com'`) + +This isn’t an exhaustive list of where your team may be using the LIKE operator throughout your dbt models, but it contains some common scenarios analytics engineers face day-to-day. \ No newline at end of file diff --git a/website/docs/sql-reference/operators/sql-not.md b/website/docs/sql-reference/operators/sql-not.md new file mode 100644 index 00000000000..e9156cb9720 --- /dev/null +++ b/website/docs/sql-reference/operators/sql-not.md @@ -0,0 +1,58 @@ +--- +id: not +title: SQL NOT +description: The SQL NOT operator allows you to return results from conditions that are not true. The NOT boolean is kind of similar to an adjective—it’s often put in front of another operator. +slug: /sql-reference/not +--- + + + Working with the SQL NOT operator + + +This will be a not *not* useful page on a helpful SQL operator. + +Ok, we had to get that out of the way. The SQL NOT operator allows you to return results from conditions that are not true. Pretty intuitive, right? + +In this page, we’ll dive into how to use the NOT operator, demonstrate an example, and elaborate on potential use cases.
+ +## How to use the SQL NOT operator + +The NOT boolean is kind of similar to an adjective—it’s often put in front of another operator, such as [BETWEEN](/sql-reference/between), [LIKE](/sql-reference/like)/[ILIKE](/sql-reference/ilike), IS, and [IN](/sql-reference/in), to return rows that do not meet the specified criteria. Below is an example of how to use NOT in front of a LIKE operator: + +`where not like ` + +This syntax can be easily modified for other operators: +- `where not between and ` +- `where is not null` +- `where is not in (array_of_options)` +- …or placed altogether in a different place, such as a case statement (ex. `case when is not null then 1 else 0 end`) + +Let’s dive into a practical example using the NOT operator. + +### SQL NOT example + +```sql +select + payment_id, + order_id, + payment_method +from {{ ref('payments') }} +where payment_method not like '%card' +``` + +This simple query using the sample dataset [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `payments` table is returning all rows whose `payment_method` is not a card-type (ex. gift card or credit card): + +| **payment_id** | **order_id** | **payment_method** | +|:---:|:---:|:---:| +| 3 | 3 | coupon | +| 4 | 4 | coupon | +| 5 | 5 | bank_transfer | +| 10 | 9 | bank_transfer | + +## SQL NOT syntax in Snowflake, Databricks, BigQuery, and Redshift + +[Snowflake](https://docs.snowflake.com/en/sql-reference/operators-logical.html), [Databricks](https://docs.databricks.com/sql/language-manual/functions/not.html), [BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/operators), and [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_logical_condition.html) all support the NOT operator, but may not all support secondary operators you would typically use the NOT operator in pair with. 
For example, `where <string_column> not ilike <pattern>` is valid in Snowflake, Databricks, and Redshift, but the ILIKE operator is not supported in BigQuery, so this example would not be valid across all data warehouses. + +## NOT operator example use cases + +There are probably many scenarios where you’d want to use the NOT operator in your WHERE clauses or case statements, but we commonly see NOT operators used to remove nulls or boolean-identified deleted rows in source data in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). This removal of unnecessary rows can potentially help the performance of downstream [intermediate](https://docs.getdbt.com/guides/best-practices/how-we-structure/3-intermediate) and [mart models](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts). \ No newline at end of file diff --git a/website/docs/sql-reference/operators/sql-or.md b/website/docs/sql-reference/operators/sql-or.md new file mode 100644 index 00000000000..2319760d0fa --- /dev/null +++ b/website/docs/sql-reference/operators/sql-or.md @@ -0,0 +1,52 @@ +--- +id: or +title: SQL OR +description: Read this guide to learn about the SQL OR operator in dbt. +slug: /sql-reference/or +--- + + + Working with the SQL OR Operator + + +We tried to come up with something witty about using the OR operator in a query, but couldn’t think of any 🤷 + +Use the OR operator in a WHERE clause to filter on multiple field values or perform more advanced joins on multiple fields. + +## How to use the OR operator + +The OR operator is technically a boolean operator—meaning it returns results that evaluate to true. It’s straightforward to use, and you’ll typically see it appear in a WHERE clause to filter query results appropriately or joins that involve multiple possible fields.
+ +### OR operator example + +```sql +select + order_id, + customer_id, + order_date, + status, + amount +from {{ ref('orders') }} +where status = 'shipped' or status = 'completed' +limit 3 +``` + +This query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return results where the order status is shipped or completed: + +| order_id | customer_id | order_date | status | amount | +|:---:|:---:|:---:|:---:|:---:| +| 2 | 3 | 2018-01-02 | completed | 20.0000 | +| 3 | 94 | 2018-01-04 | completed | 1.00000 | +| 4 | 50 | 2018-01-05 | completed | 25.0000 | + +## OR operator syntax in Snowflake, Databricks, BigQuery, and Redshift + +Snowflake, Databricks, Google BigQuery, and Amazon Redshift all support the OR operator with the syntax looking the same in each platform. You may see the OR operator substituted for a more appropriate IN operator. + +## OR use cases + +We most commonly see OR operators used in queries and dbt models to: +- Return results for fields of varying values +- Joining tables on multiple fields using an OR operator (fair warning: this can be a bit scary and inefficient, so use OR operators in joins very carefully and consider refactoring your work to avoid these scenarios) + +This isn’t an extensive list of where your team may be using OR throughout your data work, but it contains some common scenarios analytics engineers face day-to-day. diff --git a/website/docs/sql-reference/other/sql-cast.md b/website/docs/sql-reference/other/sql-cast.md new file mode 100644 index 00000000000..cf24a12706e --- /dev/null +++ b/website/docs/sql-reference/other/sql-cast.md @@ -0,0 +1,69 @@ +--- +id: cast +title: SQL CAST +description: Executing the CAST function in a SELECT statement will return the column you specified as the newly specified data type. 
+slug: /sql-reference/cast +--- + + + Working with the SQL CAST function + + +Let’s set the scene: You are knee-deep in a new data model and cannot figure out why the join between `user_id` in `table a` is not successfully joining with the `user_id` in `table b`. You dig a little deeper and discover that `user_id` in `table a` is an integer and `user_id` in `table b` is a string. + +*Cue throwing hands in the air.* + +It *will* happen: You’ll find column types in your source data or upstream models that will likely need to be cast into different data types; perhaps to make joins easier, calculations more intuitive, or data more readable. Regardless of the reason, you’ll find yourself inevitably casting some data as an analytics engineer and using the SQL CAST function to help you out. + +## How to use SQL CAST function + +The syntax for using the CAST function looks like the following: + +```sql +cast(<column_name> as <new_data_type>) +``` + +Executing this function in a SELECT statement will return the column you specified as the newly specified data type. Analytics engineers will typically be casting fields to more appropriate or useful numeric, string, and date types. You may additionally use the CAST function in WHERE clauses and in joins. + +Below, we’ll walk through a practical example using the CAST function.
+ +### SQL CAST function example + +You can cast the `order_id` and `customer_id` fields of the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` model from number types to strings using the following code: + +```sql +select + cast(order_id as string) as order_id, + cast(customer_id as string) as customer_id, + order_date, + status +from {{ ref('orders') }} +``` + +After running this query, the `orders` table will look a little something like this: + +| order_id | customer_id | order_date | status | +|---|---|---|---| +| 1 | 1 | 2018-01-01 | returned | +| 2 | 3 | 2018-01-02 | completed | +| 3 | 94 | 2018-01-04 | completed | + +Let’s be clear: the resulting data from this query looks exactly the same as the upstream `orders` model. However, the `order_id` and `customer_id` fields are now strings, meaning you could easily concat different string variables to them. + +> Casting columns to their appropriate types typically happens in our dbt project’s [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). A few reasons for that: data cleanup and standardization, such as aliasing, casting, and lower or upper casing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. + +## SQL CAST function syntax in Snowflake, Databricks, BigQuery, and Redshift + +Google BigQuery, Amazon Redshift, Snowflake, Postgres, and Databricks all support the ability to cast columns and data to different types. In addition, the syntax to cast is the same across all of them using the CAST function. + +You may also see the CAST function replaced with a double colon (::), followed by the data type to convert to; `cast(order_id as string)` is the same thing as `order_id::string` in most data warehouses. + +## CAST function use cases + +You know at one point you’re going to need to cast a column to a different data type. But what are the scenarios folks run into that call for these conversions? 
At their core, these conversions need to happen because raw source data doesn’t match the analytics or business use case. This typically happens for a few reasons: + +- Differences in needs or miscommunication from [backend developers](https://docs.getdbt.com/blog/when-backend-devs-spark-joy#signs-the-data-is-sparking-joy) +- tools [defaulting to certain data types](https://airbytehq.github.io/integrations/sources/google-sheets/) +- BI tools require certain fields to be specific data types + +A key thing to remember when you’re casting data is the user experience in your end BI tool: are business users expecting `customer_id` to be filtered on 1 or '1'? What is more intuitive for them? If one `id` field is an integer, all `id` fields should be integers. Just like all data modeling, consistency and standardization is key when determining when and what to cast. \ No newline at end of file diff --git a/website/docs/sql-reference/other/sql-comments.md b/website/docs/sql-reference/other/sql-comments.md new file mode 100644 index 00000000000..811f2b4339e --- /dev/null +++ b/website/docs/sql-reference/other/sql-comments.md @@ -0,0 +1,61 @@ +--- +id: comments +title: SQL Comments +description: Inline SQL comments will begin with two dashes (--) in front of them in a query or dbt model; any text following these dashes is therefore what you’d call “commented out.” For longer, multi-line comments, you’ll typically see this syntax `/* your multi-line comment here */` used. +slug: /sql-reference/comments +--- + + + Working with the SQL Comments + + +SQL comments…a two-folded thing: Are we talking about comments *inline* in SQL? Or comments on a table or view in the database? + +Why not both!? + +In this page, we’ll unpack how to create both inline and database object-level comments, general best practices around SQL comments, and how dbt can help you improve (and version-control) your comments. 
+ +## How to create SQL comments + +Inline SQL comments will begin with two dashes (--) in front of them in a query or dbt model; any text following these dashes is therefore what you’d call “commented out.” For longer, multi-line comments, you’ll typically see this syntax `/* your multi-line comment here */` used. + +### SQL comment example + +```sql +/* these lines form a multi-line SQL comment; if it’s uncommented, +it will make this query error out */ +select + customer_id, + -- order_id, this row is commented out + order_date +from {{ ref ('orders') }} +``` + +In practice, you’ll likely see SQL comments at the beginning of complex code logic, to help future developers or even advanced business users understand what specific blocks of code are accomplishing. Other times, you’ll see comments like the code above, that are commenting out lines no longer needed (or in existence) for that query or model. We’ll dive more into best practices around inline comments later on this page. + +For comments *on* database objects, such as views and tables, there’s a different syntax to add these explicit comments: + +```sql +comment on [database object type] is 'comment text here'; +``` + +These database object-level comments are more useful for adding additional context or metadata to these objects versus inline comments being useful for explaining code functionality. Alternatively, these table and view-level comments can be easily abstracted out and version-controlled using [model descriptions in dbt](https://docs.getdbt.com/reference/resource-properties/description) and persisted in the objects using the [persist_docs config](/reference/resource-configs/persist_docs) in dbt. + +## SQL comments in Snowflake, Databricks, BigQuery, and Redshift + +Google BigQuery, Amazon Redshift, Snowflake, and Databricks all support the ability to add inline SQL comments. 
With the exception of BigQuery, these data warehouses also support native database object-level comments; BigQuery does, however, support native field-level descriptions. + +## SQL commenting best practices + +In general, inline SQL comments should be used thoughtfully; another analytics engineer should be able to pair your comments with your code to clearly understand model functionality. + +We recommend leveraging inline comments in the following situations: + +- Explain complex code logic that if you had to scratch your head at, someone else will have to scratch their head at +- Explain niche, unique-to-your-business logic +- Separate out field types (ex. Ids, booleans, strings, dates, numerics, and timestamps) in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging) to create more readable, organized, and formulaic models +- Clearly label tech debt (`-- [TODO]: TECH DEBT`) in queries or models + + +If you find your inline SQL comments are getting out of control, less scannable and readable, that’s a sign to lean more heavily on dbt Docs and markdown files in your dbt project. dbt supports [descriptions](https://docs.getdbt.com/reference/resource-properties/description), which allow you to add robust model (or macro, source, snapshot, seed, and source) and column descriptions that will populate in hosted dbt Docs. For models or columns that need more thorough or customizable documentation, leverage [doc blocks in markdown and YAML files](https://docs.getdbt.com/reference/resource-properties/description#use-a-docs-block-in-a-description) to create more detailed explanations and comments. 
+ diff --git a/website/docs/sql-reference/statements/sql-case-statement.md b/website/docs/sql-reference/statements/sql-case-statement.md new file mode 100644 index 00000000000..135e0c06bf1 --- /dev/null +++ b/website/docs/sql-reference/statements/sql-case-statement.md @@ -0,0 +1,75 @@ +--- +id: case +title: SQL CASE WHEN +description: CASE statements allow you to cascade through multiple scenarios (or cases) in your data, evaluate them if they’re true, and output a corresponding value for each case. +slug: /sql-reference/case +--- + + + Working with the SQL CASE statements + + +SQL case statements are the backbone of analytics engineers and dbt projects. They help add context to data, make fields more readable or usable, and allow you to create specified buckets with your data. + +To informally formalize it, case statements are the SQL equivalent of an if-then statement in other programming languages. They allow you to cascade through multiple scenarios (or cases) in your data, evaluate them if they’re true, and output a corresponding value for each case. + +In this page, we’ll break down how to use SQL case statements and demonstrate why they’re valuable to modern data teams. + +## How to use the SQL case statements + +Case when statements are created in [SELECT statements](/sql-reference/select) along with other fields you choose to select. The general syntax for SQL case when statements is as follows: + +```sql +case when [scenario 1] then [result 1] + when [scenario 2] then [result 2] + -- …as many scenarios as you want + when [scenario n] then [result n] + else [fallback result] -- this else is optional +end as +``` + +Some notes on case statement functionality: +- Scenarios in case statements are *evaluated in the order they’re listed*. What does this mean? It means that if multiple scenarios evaluate to true, the earliest listed true scenario is the one whose result is returned. 
+- The results in each scenario need to be of the same data type; if scenario 1 results in a string, all other scenarios need to be [strings](/sql-reference/strings). +- Oftentimes data teams will omit a final `else` scenario since the `else [fallback result]` is optional and defaulted to `else null`. +- In general, case statement performance in select statements is relatively efficient (compared to other SQL functionality like aggregates or clunky joins involving ANDs and ORs); this isn’t to say it’s efficient (or smart) to be comparing a ton of scenarios, but it likely won’t be the bottleneck in your data models. +- Case when statement results can also be passed into aggregate functions, such as [MAX](/sql-reference/max), [MIN](/sql-reference/min), and [COUNT](/sql-reference/count), or even date functions (ex. `date_trunc('month', [case when statement])`) + +Below, let’s take a look at a practical example using a case statement. + +### SQL CASE WHEN example + +```sql +select + order_id, + round(amount) as amount, + case when amount between 0 and 10 then 'low' + when amount between 11 and 20 then 'medium' + else 'high' + end as order_value_bucket +from {{ ref('orders') }} +``` + +This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return a new field that buckets order amount based on criteria: + +| **order_id** | **amount** | **order_value_bucket** | +|:---:|:---:|:---:| +| 1 | 10 | low | +| 2 | 20 | medium | +| 3 | 1 | low | +| 4 | 25 | high | +| 5 | 17 | medium | + +## SQL CASE WHEN syntax in Snowflake, Databricks, BigQuery, and Redshift + +Since it’s a fundamental of SQL, most, if not all, modern data warehouses support the ability to add case when statements to their queries. Snowflake, Databricks, Google BigQuery, and Amazon Redshift all support case statements and have the same syntax for them.
+ +## CASE WHEN use cases + +The use cases for case statements in dbt models and ad hoc queries are almost endless; as a result, we won’t (be able to) create an exhaustive list of where you might see case statements in the wild. + +Instead, it’s important to know *why* you’d want to use them in your data work and when you wouldn’t want to use them. Some example reasons you’d want to leverage case statements: +- Create booleans from your existing data (ex. `case when cnt > 1 then true else false end as is_active`) +- Establish mappings between raw data and more general buckets of data (see example earlier in the page); note that if you find yourself creating many case when scenarios for a mapping that doesn’t change over time, you’ll likely want to import that mapping either as its own dbt model or data source (a good use case for [seeds](https://docs.getdbt.com/docs/build/seeds)) +- If you find yourself creating the same case when statement throughout your models, consider abstracting that case when into its own model or into a [macro](https://docs.getdbt.com/docs/build/jinja-macros) +- Generate more business-user friendly column values that can be easily comprehended by business users diff --git a/website/docs/sql-reference/statements/sql-distinct.md b/website/docs/sql-reference/statements/sql-distinct.md new file mode 100644 index 00000000000..55ef330479e --- /dev/null +++ b/website/docs/sql-reference/statements/sql-distinct.md @@ -0,0 +1,60 @@ +--- +id: distinct +title: SQL DISTINCT +description: Using DISTINCT in a SELECT statement will force a query to only return non-duplicate rows. You may commonly see a DISTINCT clause in COUNT functions to get counts of distinct rows. +slug: /sql-reference/distinct +--- + + + Working with the SQL DISTINCT + + +Let’s just put it out there: at some point in your data work, you’ll encounter duplicates in your data. They may be introduced from a faulty data source or created during the joining and transforming of data.
You may need a more sophisticated or refactored solution for the latter scenario, but it never hurts to know how to use DISTINCT in a query. + +Using DISTINCT in a SELECT statement will force a query to only return non-duplicate rows. You may commonly see a DISTINCT clause in COUNT functions to get counts of distinct rows. + +## How to use SQL DISTINCT in a query + +To remove duplicate rows from a query, you add DISTINCT immediately after SELECT followed by the columns you want to be selected: + +```sql +select + distinct + column_1, + column_2 +from my_data_source +``` + +Let’s take a look at a practical example using DISTINCT below. + +### SQL DISTINCT example + +```sql +select + count(customer_id) as cnt_all_orders, + count(distinct customer_id) as cnt_distinct_customers +from {{ ref('orders') }} +``` + +This simple query is something you may do while doing initial exploration of your data; it will return the count of `customer_ids` and count of distinct `customer_ids` that appear in the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table: + +| cnt_all_orders | cnt_distinct_customers | +|:---:|:---:| +| 99 | 62 | + +As you can see from the query results, there are 99 orders placed by customers, but only 62 distinct customers in the table. + +## DISTINCT syntax in Snowflake, Databricks, BigQuery, and Redshift + +Since it’s a pillar of SQL, all modern data warehouses support the ability to use DISTINCT in a SELECT statement 😀 + +## DISTINCT use cases + +You’ll most commonly see queries using a DISTINCT statement to: + +- Remove unnecessary duplicate rows from a data model; a word of caution on this: if you need to use DISTINCT in a downstream, non-source model that contains joins, there’s a chance that there could be faulty logic producing duplicates in the data, so always double-check that they are true duplicates. + +- Find the counts of distinct fields in a dataset, especially for primary or surrogate keys.
+ +This isn’t an extensive list of where your team may be using DISTINCT throughout your development work, dbt models, and BI tool logic, but it contains some common scenarios analytics engineers face day-to-day. + diff --git a/website/docs/sql-reference/statements/sql-from.md b/website/docs/sql-reference/statements/sql-from.md new file mode 100644 index 00000000000..615e7a7314c --- /dev/null +++ b/website/docs/sql-reference/statements/sql-from.md @@ -0,0 +1,49 @@ +--- +id: from +title: SQL FROM +description: The SQL FROM statement allows you to identify the database schema object (table/view) you want to select data from in a query. +slug: /sql-reference/from +--- + + + Working with SQL FROM statements + + +What makes the analytics world go ‘round? Queries and bad graphs. (Since we’re here to keep it brief, we won’t go into the latter here 😉) + +The first thing someone learns in SQL: how to build a query using [SELECT](/sql-reference/select) and FROM statements. The SQL FROM statement is the fundamental building block of any query: it allows you to identify the database schema object (table/view) you want to select data from in a query. + +In a dbt project, a SQL dbt model is technically a singular SELECT statement (often built leveraging CTEs or subqueries) using a [reference](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) to an upstream data model or table in a FROM statement. + +## How to use SQL FROM statements + +Any query begins with a simple SELECT statement and wrapped up with a FROM statement: + +```sql +select + order_id, --select your columns + customer_id, + order_date +from {{ ref('orders') }} --the table/view/model you want to select from +limit 3 +``` + +Woah woah woah! That is not the typical FROM statement you’re probably used to seeing! + +Most FROM statements in the non-dbt world, such as when you’re running ad-hoc queries directly in your data warehouse, will follow the `FROM database.schema.table_name` syntax. 
In dbt projects, analytics engineers leverage [the ref statement](https://docs.getdbt.com/reference/dbt-jinja-functions/ref) to refer to other data models and sources to automatically build a dependency graph and avoid having to hard-code schema names. This flexibility is valuable as analytics engineers develop in their own development environments (schemas) without having to rename tables in their FROM statements. + +This basic query is selecting three columns from the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop/blob/main/models/orders.sql) `orders` table and returning three rows. If you execute this query in your data warehouse, it will return a result looking like this: + +| **order_id** | **customer_id** | **order_date** | +|:---:|:---:|:---:| +| 1 | 1 | 2018-01-01 | +| 2 | 3 | 2018-01-02 | +| 3 | 94 | 2018-01-04 | + +In the query above, dbt automatically compiles the `from {{ ref('orders') }}` to `from analytics.jaffle_shop.orders` when the query is sent down to the data warehouse and run in the production environment. + +If you’re selecting from multiple tables or models, that’s where you’d rely on unions or joins to bring multiple tables together in a way that makes sense to your data. + +## FROM statement syntax in Snowflake, Databricks, BigQuery, and Redshift + +Just as the humble SELECT statement is a SQL fundamental that goes untampered by the data warehouses, FROM syntax does not vary within them. As a result, writing the actual `select…from` statement across Snowflake, Databricks, Google BigQuery, and Amazon Redshift would look the same.
\ No newline at end of file diff --git a/website/docs/sql-reference/statements/sql-group-by.md b/website/docs/sql-reference/statements/sql-group-by.md new file mode 100644 index 00000000000..b6a9a37effe --- /dev/null +++ b/website/docs/sql-reference/statements/sql-group-by.md @@ -0,0 +1,73 @@ +--- +id: group-by +title: SQL GROUP BY +description: The GROUP BY statement allows you to group query results by specified columns and is used in pair with aggregate functions such as AVG and SUM to calculate those values across specific rows. +slug: /sql-reference/group-by +--- + + + Working with the SQL GROUP BY statement + + +GROUP BY…it’s a little hard to explicitly define in a way *that actually makes sense*, but it will inevitably show up countless times in analytics work and you’ll need it frequently. + +To put it in the simplest terms, the GROUP BY statement allows you to group query results by specified columns and is used in pair with aggregate functions such as [AVG](/sql-reference/avg) and [SUM](/sql-reference/sum) to calculate those values across specific rows. + +## How to use the SQL GROUP BY statement + +The GROUP BY statement appears at the end of a query, after any joins and [WHERE](/sql-reference/where) filters have been applied: + +```sql +select + my_first_field, + count(id) as cnt --or any other aggregate function (sum, avg, etc.) +from my_table +where my_first_field is not null +group by 1 --grouped by my_first_field +order by 1 desc +``` + +A few things to note about the GROUP BY implementation: +- It’s usually listed as one of the last rows in a query, after any joins or where statements; typically you’ll only see [HAVING](/sql-reference/having), [ORDER BY](/sql-reference/order-by), or [LIMIT](/sql-reference/limit) statements following it in a query +- You can group by multiple fields (ex. 
`group by 1,2,3`) if you need to; in general, we recommend performing aggregations and joins in separate CTEs to avoid having to group by too many fields in one query or CTE +- You may also group by explicit column name (ex. `group by my_first_field`) or even a manipulated column name that is in the query (ex. `group by date_trunc('month', order_date)`) + +:::note Readability over DRYness? +Grouping by explicit column name (versus column number in query) is a two-sided trade-off: on one hand, it’s potentially more readable by end business users; on the other hand, if a grouped column name changes, that name change needs to be reflected in the group by statement. Use a grouping convention that works for you and your data, but try to keep to one standard style. +::: + +### SQL GROUP BY example + +```sql +select + customer_id, + count(order_id) as num_orders +from {{ ref('orders') }} +group by 1 +order by 1 +limit 5 +``` + +This simple query using the sample dataset [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return customers and the count of orders they’ve placed: + +| customer_id | num_orders | +|:---:|:---:| +| 1 | 2 | +| 2 | 1 | +| 3 | 3 | +| 6 | 1 | +| 7 | 1 | + +Note that the `order by` and `limit` statements are after the `group by` in the query. + +## SQL GROUP BY syntax in Snowflake, Databricks, BigQuery, and Redshift + +Snowflake, Databricks, BigQuery, and Redshift all support the ability to group by columns and follow the same syntax. + +## GROUP BY use cases + +Aggregates, aggregates, and did we mention, aggregates? GROUP BY statements are needed when you’re calculating aggregates (averages, sum, counts, etc.) by specific columns; your query will not run successfully without them if you’re selecting non-aggregated columns alongside aggregate functions in your query.
You may also see GROUP BY statements used to deduplicate rows or join aggregates onto other tables with CTEs; [this article provides a great writeup](https://www.getdbt.com/blog/write-better-sql-a-defense-of-group-by-1/) on specific areas you might see GROUP BYs used in your dbt projects and data modeling work. + +:::tip 👋Bye bye finicky group bys +In some sticky data modeling scenarios, you may find yourself needing to group by many columns to collapse a table down into fewer rows or deduplicate rows. In that scenario, you may find yourself writing `group by 1, 2, 3,.....,n` which can become tedious, confusing, and difficult to troubleshoot. Instead, you can leverage a [dbt macro](https://github.com/dbt-labs/dbt-utils#group_by-source) that will save you from writing `group by 1,2,....,46` to instead a simple `{{ dbt_utils.group_by(46) }}`...you’ll thank us later 😉 +::: diff --git a/website/docs/sql-reference/statements/sql-select.md b/website/docs/sql-reference/statements/sql-select.md new file mode 100644 index 00000000000..49132524096 --- /dev/null +++ b/website/docs/sql-reference/statements/sql-select.md @@ -0,0 +1,49 @@ +--- +id: select +title: SQL SELECT +description: The SQL SELECT statement is the fundamental building block of any query that allows you to select specific columns from a database schema object. +slug: /sql-reference/select +--- + + + Working with SQL SELECT statements + + +My goodness, would there even be modern data teams without SQL SELECT statements? Probably not. + +Luckily, we live in a world of tabular data, cloud data warehouses, and SQL prowess. Analysts and analytics engineers are writing queries, creating data models, and leveraging SQL to power their [data transformations](https://www.getdbt.com/analytics-engineering/transformation/) and analysis. But what makes these queries possible? SELECT statements. 
+ +The SQL SELECT statement is the fundamental building block of any query: it allows you to select specific columns (data) from a database schema object (table/view). In a dbt project, a SQL dbt model is technically a singular SELECT statement (often built leveraging CTEs or subqueries). + +## How to use SELECT + +Any query begins with a simple SELECT statement: + +```sql +select + order_id, --your first column you want selected + customer_id, --your second column you want selected + order_date --your last column you want selected (and so on) +from {{ ref('orders') }} --the table/view/model you want to select from +limit 3 +``` + +This basic query is selecting three columns from the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop/blob/main/models/orders.sql) `orders` table and returning three rows. If you execute this query in your data warehouse, it will return a result looking like this: + +| order_id | customer_id | order_date | +|:---:|:---:|:---:| +| 1 | 1 | 2018-01-01 | +| 2 | 3 | 2018-01-02 | +| 3 | 94 | 2018-01-04 | + +You may also commonly see queries that `select * from table_name`. The asterisk or star is telling the compiler to select all columns from a specified table or view. + +:::tip Goodbye carpal tunnel +Leverage [dbt utils’ star macro](/blog/star-sql-love-letter) to be able to both easily select many and specifically exclude certain columns. +::: + +In a dbt project, analytics engineers will typically write models that contain multiple CTEs that build to one greater query. For folks that are newer to analytics engineering or dbt, we recommend they check out the [“How we structure our dbt projects” guide](/guides/best-practices/how-we-structure/1-guide-overview) to better understand why analytics folks like modular data modeling and CTEs.
+ +## SELECT statement syntax in Snowflake, Databricks, BigQuery, and Redshift + +While we know the data warehouse players like to have their own slightly different flavors and syntax for SQL, they have conferred together that the SELECT statement is sacred and unchangeable. As a result, writing the actual `select…from` statement across Snowflake, Databricks, Google BigQuery, and Amazon Redshift would look the same. However, the actual SQL manipulation of data within the SELECT statement (ex. adding dates, casting columns) might look slightly different between each data warehouse. \ No newline at end of file diff --git a/website/docs/sql-reference/string-functions/sql-concat.md b/website/docs/sql-reference/string-functions/sql-concat.md new file mode 100644 index 00000000000..2317ee5aca8 --- /dev/null +++ b/website/docs/sql-reference/string-functions/sql-concat.md @@ -0,0 +1,52 @@ +--- +id: concat +title: SQL CONCAT +description: The CONCAT function allows analytics engineers to join multiple string values in a query. +slug: /sql-reference/concat +--- + + + Working with the SQL CONCAT + + +There is no better or simpler way to join multiple string values in a query than by using the CONCAT function. Full stop. + +It’s a straightforward function with pretty straightforward use cases. Use this page to understand how to use the CONCAT function in your data warehouse and why analytics engineers use it throughout their dbt models. + +## How to use the CONCAT function + +Using the CONCAT function is pretty straightforward: you’ll pass in the strings or binary values you want to join together in the correct order into the CONCAT function. You can pass in as many expressions into the CONCAT function as you would like. 
+ +### CONCAT function example + +```sql +select + user_id, + first_name, + last_name, + concat(first_name, ' ', last_name) as full_name +from {{ ref('customers') }} +limit 3 +``` + +This query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `customers` table will return results like this with a new column of the combined `first_name` and `last_name` fields with a space between them: + +| user_id | first_name | last_name | full_name | +|:---:|:---:|:---:|:---:| +| 1 | Michael | P. | Michael P. | +| 2 | Shawn | M. | Shawn M. | +| 3 | Kathleen | P. | Kathleen P. | + +## CONCAT function syntax in Snowflake, Databricks, BigQuery, and Redshift + +Snowflake, Databricks, Google BigQuery, and Amazon Redshift all support the CONCAT function with the syntax looking the same in each platform. You may additionally see the CONCAT function represented by the `||` operator (ex. `select first_name || last_name AS full_name from {{ ref('customers') }}`) which has the same functionality as the CONCAT function in these data platforms. + +## CONCAT use cases + +We most commonly see concatenation in SQL for strings to: + +- Join together address/geo columns into one field +- Add hard-coded string values to columns to create clearer column values +- Create surrogate keys using a hashing method and multiple column values (ex. `md5(column_1 || column_2) as unique_id`) + +This isn’t an extensive list of where your team may be using CONCAT throughout your data work, but it contains some common scenarios analytics engineers face day-to-day.
\ No newline at end of file diff --git a/website/docs/sql-reference/string-functions/sql-lower.md b/website/docs/sql-reference/string-functions/sql-lower.md new file mode 100644 index 00000000000..8c8622bb77a --- /dev/null +++ b/website/docs/sql-reference/string-functions/sql-lower.md @@ -0,0 +1,68 @@ +--- +id: lower +title: SQL LOWER +description: Using the LOWER function on a string value will return the input as an all-lowercase string. It’s an effective way to create consistent capitalization for string values across your data. +slug: /sql-reference/lower +--- + + + Working with the SQL LOWER function + + +We’ve all been there: +- In a user signup form, user A typed in their name as `Kira Furuichi`, user B typed it in as `john blust`, and user C wrote `DAvid KrevitT` (what’s up with that, David??) +- Your backend application engineers are adamant customer emails are in all caps +- All of your event tracking names are lowercase + +In the real world of human imperfection, opinions, and error, string values are likely to take inconsistent capitalization across different data sources (or even within the same data source). There’s always a little lack of rhyme or reason for why some values are passed as upper or lowercase, and it’s not worth the headache to unpack that. + +So how do you create uniformity for string values that you collect across all your data sources? The LOWER function! + +Using the LOWER function on a string value will return the input as an all-lowercase string. It’s an effective way to create consistent capitalization for string values across your data. + +## How to use the SQL LOWER function + +The syntax for using the LOWER function looks like the following: + +```sql +lower(<string_value>) +``` + +Executing this command in a SELECT statement will return the lowercase version of the input string. You may additionally use the LOWER function in WHERE clauses and on join values. + +Let’s take a look at a practical example using the LOWER function.
+ +### SQL LOWER function example + +You can lower the first name and last name of the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `customers` model using the following code: + +```sql +select + customer_id, + lower(first_name) as first_name, + lower(last_name) as last_name +from {{ ref('customers') }} +``` + +After running this query, the `customers` table will look a little something like this: + +| customer_id | first_name | last_name | +|---|---|---| +| 1 | michael | p. | +| 2 | shawn | m. | +| 3 | kathleen | p. | + +Now, all characters in the `first_name` and `last_name` columns are lowercase. + +> Changing all string columns to lowercase to create uniformity across data sources typically happens in our [dbt project’s staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lowercasing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. + +## SQL LOWER function syntax in Snowflake, Databricks, BigQuery, and Redshift + +Google BigQuery, Amazon Redshift, Snowflake, Postgres, and Databricks all support the LOWER function. In addition, the syntax to use LOWER is the same across all of them. + + +## LOWER function use cases + +Let’s go back to our chaotic trio of users A, B, and C who all used different capitalizations to type in their names. If you don’t create consistent capitalization for string values, how would a business user know what to filter for in their BI tool? A business user could filter a name field on “John Blust” since that’s what they would expect it to look like, only to get zero results back. By creating a consistent capitalization format (upper or lowercase) for all string values in your data models, you therefore create some expectations for business users in your BI tool. 
+ +There will most likely never be 100% consistency in your data models, but doing all that you can to mitigate that chaos will make your life and the life of your business users hopefully a little easier. Use the LOWER function to create a consistent casing for all strings in your data sources. diff --git a/website/docs/sql-reference/string-functions/sql-trim.md b/website/docs/sql-reference/string-functions/sql-trim.md new file mode 100644 index 00000000000..ad54a015437 --- /dev/null +++ b/website/docs/sql-reference/string-functions/sql-trim.md @@ -0,0 +1,53 @@ +--- +id: trim +title: SQL TRIM +description: The SQL TRIM function removes the leading and trailing characters of a string. By default, it removes the blank space character from the beginning and end of a string. +slug: /sql-reference/trim +--- + + + Working with the SQL TRIM function + + +We’ve been there: pesky blank spaces, weird, inconsistent formats, or unaccountable asterisks hiding at the end of your column value—[strings](/sql-reference/strings) are one of the most variable data types in your datasets. They likely lack a uniform casing, vary in length, and will inevitably have characters you need to trim from them. + +Introducing: the SQL TRIM function, which removes the leading and trailing characters of a string. By default, it removes the blank space character from the beginning and end of a string. + +## How to use the SQL TRIM function + +The syntax for using the TRIM function looks like the following: + +```sql +trim(<field_name> [, <characters_to_remove>]) +``` + +Like we said earlier, the default `<characters_to_remove>` is a blank space, such that if you were to `trim(' string with extra leading space')` it would return `'string with extra leading space'`. You can explicitly specify single characters or a pattern to trim from your strings.
+ +### SQL TRIM function example + +```sql +select + first_name, + concat('*', first_name, '**') as test_string, + trim(test_string, '*') as back_to_first_name +from {{ ref('customers') }} +limit 3 +``` + +After running this query, the resulting `customers` table will look like this: + +| first_name | test_string | back_to_first_name | +|---|---|---| +| Julia | *Julia** | Julia | +| Max | *Max** | Max | +| Laura | *Laura** | Laura | + +In this query, you’re adding superfluous asterisks to a string using the [CONCAT function](/sql-reference/concat) and recleaning it using the TRIM function. Even though I specified one asterisk in the TRIM function itself, it recognizes that as a pattern to remove from the beginning and end of a string, which is why the double asterisks (**) were removed from the end of the `test_string` column. + +## SQL TRIM function syntax in Snowflake, Databricks, BigQuery, and Redshift + +[Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#trim), [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_TRIM.html), [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/trim.html), and [Databricks](https://docs.databricks.com/sql/language-manual/functions/trim.html) all support the ability to use the TRIM function. In addition, the syntax to trim strings is the same across all of them using the TRIM function. These data warehouses also support the RTRIM and LTRIM functions, which allow you to only trim characters from the right side and left side of a string, respectively. + +## TRIM function use cases + +If string values in your raw data have extra white spaces or miscellaneous characters, you’ll leverage the TRIM (and subset RTRIM and LTRIM) functions to help you quickly remove them.
You’ll likely do this cleanup in [staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging), where you’re probably standardizing casing and doing other minor formatting changes to string values, so you can use a clean and consistent format across your downstream models. diff --git a/website/docs/sql-reference/string-functions/sql-upper.md b/website/docs/sql-reference/string-functions/sql-upper.md new file mode 100644 index 00000000000..cf7694f8e46 --- /dev/null +++ b/website/docs/sql-reference/string-functions/sql-upper.md @@ -0,0 +1,61 @@ +--- +id: upper +title: SQL UPPER +description: Using the UPPER function on a string value will return the input as an all-uppercase string. It’s an effective way to create expected capitalization for certain string values across your data. +slug: /sql-reference/upper +--- + + + Working with the SQL UPPER function + + +UPPER is the counterpart to [LOWER](/sql-reference/lower) (who would have guessed?)—and they’re probably the most intuitive of SQL functions. + +Using the UPPER function on a string value will return the input as an all-uppercase string. It’s an effective way to create expected capitalization for certain string values across your data. + +## How to use the SQL UPPER function + +The syntax for using the UPPER function looks like the following: + +```sql +upper(<string_value>) +``` +Executing this command in a SELECT statement will return the uppercase version of the input string value. You may additionally use the UPPER function in WHERE clauses and on join values. + +Below, we’ll walk through a practical example using the UPPER function.
+ +### SQL UPPER function example + +You can uppercase the first name of the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `customers` model using the following code: + +```sql +select + customer_id, + upper(first_name) as first_name, + last_name +from {{ ref('customers') }} +``` + +After running this query, the `customers` table will look a little something like this: + +| customer_id | first_name | last_name | +|---|---|---| +| 1 | MICHAEL | P. | +| 2 | SHAWN | M. | +| 3 | KATHLEEN | P. | + +Now, all characters in the `first_name` column are uppercase (and the `last_name` values are unchanged). + +> Changing string columns to uppercase to create uniformity across data sources typically happens in our [dbt project’s staging models](https://docs.getdbt.com/guides/best-practices/how-we-structure/2-staging). There are a few reasons for that: data cleanup and standardization, such as aliasing, casting, and lower or upper casing, should ideally happen in staging models to create downstream uniformity and improve downstream performance. + +## SQL UPPER function syntax in Snowflake, Databricks, BigQuery, and Redshift + +Google BigQuery, Amazon Redshift, Snowflake, Postgres, and Databricks all support the UPPER function. In addition, the syntax to use the UPPER function is the same across all of them. + +## UPPER function use cases + +By creating a consistent capitalization format (upper or lowercase) for all string values in your data models, you therefore create some expectations for business users in your BI tool. You might use the UPPER function to: +- Uppercase country codes in data sources to meet user expectations +- Create a consistent capitalization format for string values in your data models, also creating expectations for business users in your BI tool + +There will most likely never be 100% consistency in your data models, but doing all that you can to mitigate that chaos will make your life and the life of your business users hopefully a little easier.
Use the UPPER function to create a consistent casing for all strings in your data sources. diff --git a/website/docs/sql-reference/window-functions/sql-rank.md b/website/docs/sql-reference/window-functions/sql-rank.md new file mode 100644 index 00000000000..c06ec2c8a75 --- /dev/null +++ b/website/docs/sql-reference/window-functions/sql-rank.md @@ -0,0 +1,77 @@ +--- +id: rank +title: SQL RANK +description: The RANK function returns the rank of a value (starting at 1) in an ordered group or dataset. +slug: /sql-reference/rank +--- + + + Working with the SQL RANK + + +There are many different ranking window functions…[ROW_NUMBER](/sql-reference/row-number), DENSE_RANK, RANK. Let’s start off with the most basic (RANK) and talk about what it is, how to use it, and why it’s important in analytics engineering work. + +The RANK function is an effective way to create a ranked column or filter a query based on rankings. More specifically, the RANK function returns the rank of a value (starting at 1) in an ordered group or dataset. It's important to note that if multiple values evaluated by the RANK function are the same, they’ll have the same rank. + +## How to use the RANK function + +The RANK function has a pretty simple syntax, with an optional partition field and support for ordering customization: + +`rank() over ([partition by <field(s)>] order by field(s) [asc | desc])` + +Some notes on this function’s syntax: + +- The `partition by` field is optional; if you want to rank your entire dataset by certain fields (compared to partitioning *and ranking* within a dataset), you would simply omit the `partition by` from the function call (see the example below for this). +- By default, the ordering of a ranking function is set to ascending. To explicitly make the ordering descending, you’ll need to pass in `desc` to the `order by` part of the function. + +Let’s take a look at a practical example using the RANK function below.
+ +### RANK function example + +```sql +select + order_id, + order_date, + rank() over (order by order_date) as order_rank +from {{ ref('orders') }} +``` + +This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return the rank of orders by their `order_date`: + +| order_id | order_date | order_rank | +|:---:|:---:|:---:| +| 1 | 2018-01-01 | 1 | +| 2 | 2018-01-02 | 2 | +| 3 | 2018-01-04 | 3 | +| 4 | 2018-01-05 | 4 | +| 5 | 2018-01-05 | 4 | +| 6 | 2018-01-07 | 6 | + +Some notes on these results: + +- Orders that have the same `order_date` (ex. Orders 4 and 5) have the same `order_rank` (4). +- Order 6’s `order_rank` is 6 (if you wanted the rank to evaluate to 5, you would use the DENSE_RANK function). + +:::tip Ranking functions to know +RANK is just one of the ranking functions that analytics engineering practitioners will use throughout their data models. There’s also DENSE_RANK and [ROW_NUMBER](/sql-reference/row-number) which rank rows differently than RANK. +::: + +## RANK syntax in Snowflake, Databricks, BigQuery, and Redshift + +Most, if not all, modern data warehouses support RANK and other similar ranking functions; the syntax is also the same across them. Use the table below to read more on the documentation for the RANK function in your data warehouse. + +| Data warehouse | RANK support? | +|:---:|:---:| +| Snowflake | ✅ | +| Databricks | ✅ | +| Amazon Redshift | ✅ | +| Google BigQuery | ✅ | + +## RANK function use cases + +We most commonly see the RANK function used in data work: + +- In [SELECT statements](/sql-reference/select) to add explicit ranking to rows +- In QUALIFY statements to filter a query on a ranking without having to add the rank to the query result + +This isn’t an extensive list of where your team may be using the RANK function throughout your dbt models and BI tool logic, but it contains some common scenarios analytics engineers face day-to-day.
diff --git a/website/docs/sql-reference/window-functions/sql-row-number.md b/website/docs/sql-reference/window-functions/sql-row-number.md new file mode 100644 index 00000000000..22cec8d79e8 --- /dev/null +++ b/website/docs/sql-reference/window-functions/sql-row-number.md @@ -0,0 +1,74 @@ +--- +id: row-number +title: SQL ROW_NUMBER +description: The ROW_NUMBER function returns the unique row number of a row in an ordered group or dataset. +slug: /sql-reference/row-number +--- + + + Working with the SQL ROW_NUMBER + + +On this page, let’s go deep into the ROW_NUMBER function and talk about what it is, how to use it, and why it’s important in analytics engineering work. + +The ROW_NUMBER window function is an effective way to create a ranked column or filter a query based on rankings. More specifically, the ROW_NUMBER function returns the *unique* row number of a row in an ordered group or dataset. + +Unlike the [RANK](/sql-reference/rank) and DENSE_RANK functions, ROW_NUMBER is non-deterministic, meaning that a *unique* number is assigned arbitrarily for rows with duplicate values. + +## How to use the ROW_NUMBER function + +The ROW_NUMBER function has a pretty simple syntax, with an optional partition field and support for ordering customization: + +`row_number() over ([partition by <field(s)>] order by field(s) [asc | desc])` + +Some notes on this function’s syntax: + +- The `partition by` field is optional; if you want to get the row numbers of your entire dataset (compared to grabbing row number within a group of rows in your dataset), you would simply omit the `partition by` from the function call (see the example below for this). +- By default, the ordering of a ROW_NUMBER function is set to ascending. To explicitly make the resulting order descending, you’ll need to pass in `desc` to the `order by` part of the function. + +Let’s take a look at a practical example using the ROW_NUMBER function below.
+ +### ROW_NUMBER function example + +```sql +select + customer_id, + order_id, + order_date, + row_number() over (partition by customer_id order by order_date) as row_n +from {{ ref('orders') }} +order by 1 +``` + +This simple query using the [Jaffle Shop’s](https://github.com/dbt-labs/jaffle_shop) `orders` table will return the unique row number per customer by their `order_date`: + +| customer_id | order_id | order_date | row_n | +|:---:|:---:|:---:|:---:| +| 1 | 1 | 2018-01-01 | 1 | +| 1 | 37 | 2018-02-10 | 2 | +| 2 | 8 | 2018-01-11 | 1 | +| 3 | 2 | 2018-01-02 | 1 | +| 3 | 24 | 2018-01-27 | 2 | +| 3 | 69 | 2018-03-11 | 3 | + +Because ROW_NUMBER is non-deterministic, orders per customer that have the same `order_date` would have unique `row_n` values (unlike if you used the RANK or DENSE_RANK functions). + +## ROW_NUMBER syntax in Snowflake, Databricks, BigQuery, and Redshift + +Most, if not all, modern data warehouses support ROW_NUMBER and other similar ranking functions; the syntax is also the same across them. Use the table below to read more on the documentation for the ROW_NUMBER function in your data warehouse. + +| Data warehouse | ROW_NUMBER support? | +|:---:|:---:| +| [Snowflake](https://docs.snowflake.com/en/sql-reference/functions/row_number.html) | ✅ | +| [Databricks](https://docs.databricks.com/sql/language-manual/functions/row_number.html) | ✅ | +| [Amazon Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_WF_ROW_NUMBER.html) | ✅ | +| [Google BigQuery](https://cloud.google.com/bigquery/docs/reference/standard-sql/numbering_functions#row_number) | ✅ | + +## ROW_NUMBER function use cases + +We most commonly see the ROW_NUMBER function used in data work: + +- In [SELECT statements](/sql-reference/select) to add explicit and unique row numbers in a group of data or across an entire table +- Paired with a QUALIFY statement to filter CTEs, queries, or models and capture one unique row per specified partition with the ROW_NUMBER function.
This is particularly useful when you need to remove duplicate rows from a dataset (but use this wisely!). + +This isn’t an extensive list of where your team may be using the ROW_NUMBER function throughout your dbt models, but contains some common scenarios analytics engineers face day-to-day. diff --git a/website/docs/terms/cte.md b/website/docs/terms/cte.md index 2489b623b7d..d4a4bb15915 100644 --- a/website/docs/terms/cte.md +++ b/website/docs/terms/cte.md @@ -66,7 +66,7 @@ When people talk about how CTEs can simplify your queries, they specifically mea #### Establish Structure -In leveraging CTEs, you can break complex code into smaller segments, ultimately helping provide structure to your code. At dbt Labs, we often like to use the [import, logical, and final structure](/docs/get-started/learning-more/refactoring-legacy-sql#implement-cte-groupings) for CTEs which creates a predictable and organized structure to your dbt models. +In leveraging CTEs, you can break complex code into smaller segments, ultimately helping provide structure to your code. At dbt Labs, we often like to use the [import, logical, and final structure](/guides/migration/tools/refactoring-legacy-sql#implement-cte-groupings) for CTEs which creates a predictable and organized structure to your dbt models. #### Easily identify dependencies @@ -147,7 +147,7 @@ Your results from running this query look a little like this: :::tip Tip If you are finding yourself using the same code for a certain CTE across multiple -queries or models, that’s probably a good sign that CTE should be its own [model](https://docs.getdbt.com/docs/building-a-dbt-project/building-models) or view. +queries or models, that’s probably a good sign that CTE should be its own [model](https://docs.getdbt.com/docs/build/models) or view. ::: ## CTE vs Subquery @@ -181,7 +181,7 @@ CTEs are essentially temporary views that can be used throughout a query. 
They a If you’re interested in reading more about CTE best practices, check out some of our favorite content around model refactoring and style: -- [Refactoring Legacy SQL to dbt](/docs/get-started/learning-more/refactoring-legacy-sql#implement-cte-groupings) +- [Refactoring Legacy SQL to dbt](/guides/migration/tools/refactoring-legacy-sql#implement-cte-groupings) - [dbt Labs Style Guide](https://github.com/dbt-labs/corp/blob/main/dbt_style_guide.md#ctes) - [Modular Data Modeling Technique](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) diff --git a/website/docs/terms/dag.md b/website/docs/terms/dag.md index b3cd550e580..f4247c785a4 100644 --- a/website/docs/terms/dag.md +++ b/website/docs/terms/dag.md @@ -16,7 +16,7 @@ While the concept of a DAG originated in mathematics and gained popularity in co ## DAG use cases and best practices -DAGs are an effective tool to help you understand relationships between your data models and areas of improvement for your overall data transformations. +DAGs are an effective tool to help you understand relationships between your data models and areas of improvement for your overall [data transformations](https://www.getdbt.com/analytics-engineering/transformation/). ### Unpacking relationships and data lineage @@ -39,7 +39,7 @@ What can you learn from this DAG? Immediately, you may notice a handful of thing - `stg_users`and `stg_user_groups` models are the parent models for `int_users` - A join is happening between `stg_users` and `stg_user_groups` to form the `int_users` model - `stg_orgs` and `int_users` are the parent models for `dim_users` -- `dim_users` is a the end of the DAG and is therefore downstream from a total of four different models +- `dim_users` is at the end of the DAG and is therefore downstream from a total of four different models Within 10 seconds of looking at this DAG, you can quickly unpack some of the most important elements about a project: dependencies and data lineage. 
Obviously, this is a simplified version of DAGs you may see in real life, but the practice of identifying relationships and data flows remains very much the same, regardless of the size of the DAG. @@ -79,7 +79,7 @@ Instead of manually auditing your DAG for best practices, the [dbt project evalu ## dbt and DAGs -The marketing team at dbt Labs would be upset with us if we told you we think dbt actually stood for “dag build tool,” but one of the key elements of dbt is its ability to generate documentation and infer relationships between models. And one of the hallmark features of [dbt Docs](https://docs.getdbt.com/docs/building-a-dbt-project/documentation) is the Lineage Graph (DAG) of your dbt project. +The marketing team at dbt Labs would be upset with us if we told you we think dbt actually stood for “dag build tool,” but one of the key elements of dbt is its ability to generate documentation and infer relationships between models. And one of the hallmark features of [dbt Docs](https://docs.getdbt.com/docs/collaborate/documentation) is the Lineage Graph (DAG) of your dbt project. Whether you’re using dbt Core or Cloud, dbt docs and the Lineage Graph are available to all dbt developers. The Lineage Graph in dbt Docs can show a model or source’s entire lineage, all within a visual frame. Clicking within a model, you can view the Lineage Graph and adjust selectors to only show certain models within the DAG. Analyzing the DAG here is a great way to diagnose potential inefficiencies or lack of modularity in your dbt project. @@ -97,7 +97,7 @@ This level of interconnectivity and transparency can help boost data governance ## Conclusion -A Directed acyclic graphs (DAG) is a visual representation of your data models and their connection to each other. The key components of a DAG are that nodes (sources/models/exposures) are directionally linked and don’t form acyclic loops. 
Overall, DAGs are an effective tool for understanding data lineage, dependencies, and areas of improvement in your data models. +A Directed acyclic graph (DAG) is a visual representation of your data models and their connection to each other. The key components of a DAG are that nodes (sources/models/exposures) are directionally linked and don’t form cyclic loops. Overall, DAGs are an effective tool for understanding data lineage, dependencies, and areas of improvement in your data models. > *Get started with [dbt today](https://www.getdbt.com/signup/) to start building your own DAG!* @@ -108,4 +108,4 @@ Ready to restructure (or create your first) DAG? Check out some of the resources - [Data modeling techniques for more modularity](https://www.getdbt.com/analytics-engineering/modular-data-modeling-technique/) - [How we structure our dbt projects](https://docs.getdbt.com/guides/best-practices/how-we-structure/1-guide-overview) - [How to audit your DAG](https://www.youtube.com/watch?v=5W6VrnHVkCA) -- [Refactoring legacy SQL to dbt](/docs/get-started/learning-more/refactoring-legacy-sql) +- [Refactoring legacy SQL to dbt](/guides/migration/tools/refactoring-legacy-sql) diff --git a/website/docs/terms/data-catalog.md b/website/docs/terms/data-catalog.md index c618a16cbfc..feb529e82e6 100644 --- a/website/docs/terms/data-catalog.md +++ b/website/docs/terms/data-catalog.md @@ -75,7 +75,7 @@ Do you have sophisticated data models and sources that the business knows how to Is the culture data-driven? If business users are caught up in their own processes like Google spreadsheets, they may not even utilize a catalog. You don’t want to pay for a tool that is too sophisticated for where the business and data teams currently stand. Don’t rush the process.
### Data catalog tools -Data teams may choose to use third-party tools with data cataloging capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), [Metaphor](https://support.metaphor.io/hc/en-us/articles/9302185081627), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), [Castor](https://docs.castordoc.com/integrations/dbt), and others. These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data cataloging and lineage capabilities. +Data teams may choose to use third-party tools with data cataloging capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), [Metaphor](https://support.metaphor.io/hc/en-us/articles/9302185081627), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), [Stemma](https://docs.stemma.ai/docs/stemma/getting-started/what-we-need-from-you/dbt-integration/), [Castor](https://docs.castordoc.com/integrations/dbt), and others. These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data cataloging and lineage capabilities. ## Conclusion @@ -84,4 +84,4 @@ Data catalogs are a valuable asset to any data team and business as a whole. 
The ## Additional reading - [Why both data cataloging?](https://www.getdbt.com/analytics-engineering/transformation/data-catalog/) -- [Glossary: Data warehouse](/terms/data-warehouse) \ No newline at end of file +- [Glossary: Data warehouse](/terms/data-warehouse) diff --git a/website/docs/terms/data-extraction.md b/website/docs/terms/data-extraction.md index 9a53562c97f..bc37b68cf66 100644 --- a/website/docs/terms/data-extraction.md +++ b/website/docs/terms/data-extraction.md @@ -23,7 +23,7 @@ There are two primary ways modern data teams are using to extract data: - Data lineage: What it is and how to automate it - dbt Labs + What is data lineage? And how do you get started? Data lineage provides a holistic view of how data moves through an organization, where it’s transformed and consumed. Overall, data lineage is a fundamental concept to understand in the practice of analytics engineering and modern data work. @@ -21,11 +21,11 @@ This holistic view of the data pipeline allows data teams to build, troubleshoot We’ll unpack why data lineage is important, how it works in the context of analytics engineering, and where some existing challenges still exist for data lineage. -## **Why is data lineage important?** +## Why is data lineage important? As a data landscape grows in size and complexity, the benefits of data lineage become more apparent. For data teams, the three main advantages of data lineage include reducing root-cause analysis headaches, minimizing unexpected downstream headaches when making upstream changes, and empowering business users. -### **Root cause analysis** +### Root cause analysis It happens: dashboards and reporting fall victim to data pipeline breaks. Data teams quickly need to diagnose what’s wrong, fix where things may be broken, and provide up-to-date numbers to their end business users. But when these breaks happen (and they surely do) how can teams quickly identify the root cause of the problem? 
@@ -33,13 +33,13 @@ If data teams have some form of data lineage in place, they can more easily iden Will a data lineage or a DAG solve your breaking pipelines? Definitely not. Will it potentially make your life easier to find problems in your data work? Heck yes. -### **Downstream impacts on upstream changes** +### Downstream impacts on upstream changes You may have been here—your backend engineering team drops the `customers` table to create a newer, more accurate `users` table. The only bad thing is…[they forgot to tell the data team about the change](https://docs.getdbt.com/blog/when-backend-devs-spark-joy). When you have a data lineage system, you can visually see which downstream models, nodes, and exposures are impacted by big upstream changes such as source or model renaming or removals. Referring to your DAG or data lineage system before any significant change to your analytics work is a great way to help prevent accidental downstream issues. -### **Value to business users** +### Value to business users While data lineage makes it easier for data teams to manage pipelines, stakeholders and leaders also benefit from data lineage, primarily around promoting data transparency into the data pipelines. @@ -55,17 +55,17 @@ Spotlighting redundant data models can help trim down on WET (write every time/w Overall, data lineage and data-driven business go hand-in-hand. A data lineage system allows data teams to be more organized and efficient, business users to be more confident, and data pipelines to be more modular. -## **How does data lineage work?** +## How does data lineage work? In the greater data world, you may often hear of data lineage systems based on tagging, patterns or parsing-based systems. In analytics engineering however, you’ll often see data lineage implemented in a DAG or through third-party tooling that integrates into your data pipeline. 
-### **DAGs (directed acyclic graphs)** +### DAGs (directed acyclic graphs) -If you use a transformation tool such as dbt that automatically infers relationships between data sources and models, a DAG automatically populates to show you the lineage that exists for your data transformations. +If you use a transformation tool such as dbt that automatically infers relationships between data sources and models, a DAG automatically populates to show you the lineage that exists for your [data transformations](https://www.getdbt.com/analytics-engineering/transformation/). - + -Your DAG is used to visually show upstream dependencies, the nodes that must come before a current model, and downstream relationships, the work that is impacted by the current model. DAGs are also directional—they show a defined flow of movement and form non-cyclical loops. +Your DAG is used to visually show upstream dependencies, the nodes that must come before a current model, and downstream relationships, the work that is impacted by the current model. DAGs are also directional—they show a defined flow of movement and form non-cyclical loops. Ultimately, DAGs are an effective way to see relationships between data sources, models, and dashboards. DAGs are also a great way to see visual bottlenecks, or inefficiencies in your data work (see image below for a DAG with...many bottlenecks). Data teams can additionally add [meta fields](https://docs.getdbt.com/reference/resource-configs/meta) and documentation to nodes in the DAG to add an additional layer of governance to their dbt project. @@ -77,15 +77,15 @@ DAGs shouldn’t be dependent on manual updates. 
Instead, your DAG should be aut ::: -### **Third-party tooling** +### Third-party tooling -Data teams may also choose to use third-party tools  with  lineage capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), Metaphor, [Monte Carlo](https://docs.getmontecarlo.com/docs/dbt-cloud) or [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud). These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data lineage capabilities such as column-level or business logic-level lineage. +Data teams may also choose to use third-party tools with lineage capabilities such as [Atlan](https://ask.atlan.com/hc/en-us/articles/4433673207313-How-to-set-up-dbt-Cloud), Alation, [Collibra](https://marketplace.collibra.com/listings/dbt-lineage-to-collibra-integration/), [Datafold](https://www.datafold.com/column-level-lineage), Metaphor, [Monte Carlo](https://docs.getmontecarlo.com/docs/dbt-cloud), [Select Star](https://docs.selectstar.com/integrations/dbt/dbt-cloud), or [Stemma](https://docs.stemma.ai/docs/stemma/getting-started/what-we-need-from-you/dbt-integration/). These tools often integrate directly with your data pipelines and dbt workflows and offer zoomed-in data lineage capabilities such as column-level or business logic-level lineage. -## **The challenges of data lineage** +## Data lineage challenges The biggest challenges around data lineage become more apparent as your data, systems, and business questions grow. -### **Scaling data pipelines** +### Data lineage challenge #1: Scaling data pipelines As dbt projects scale with data and organization growth, the number of sources, models, macros, seeds, and [exposures](https://docs.getdbt.com/docs/build/exposures) invariably grow. 
And with an increasing number of nodes in your DAG, it can become harder to audit your DAG for WET code or inefficiencies. @@ -97,7 +97,7 @@ Is your DAG keeping up with best practices? Instead of manually auditing your DA ::: -### **Column-level lineage** +### Data lineage challenge #2: Column-level lineage Complex workflows also add to the difficulties a data lineage system will encounter. For example, consider the challenges in describing a data source's movement through a pipeline as it's filtered, pivoted, and joined with other tables. These challenges increase when the granularity of the data lineage shifts from the table to the column level. diff --git a/website/docs/terms/data-wrangling.md b/website/docs/terms/data-wrangling.md index 49d5054073b..a5b4e99f312 100644 --- a/website/docs/terms/data-wrangling.md +++ b/website/docs/terms/data-wrangling.md @@ -63,7 +63,7 @@ If you're struggling to do all the cleaning on your own, remember that dbt packa Enriching your data means enhancing it by supplementing incomplete or missing data. This could involve basic case or coalesce statements that use an already existing column in order to produce a new column. It could also look like joining an already existing date column with a date table that contains more extensive information about a certain date. Keep in mind that you don’t want to go overboard with enriching or joining here—you only want to add what will be repeatedly used in modeling and analysis. :::tip Python for enrichment? -With the new capability of [Python in dbt](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/python-models), will folks start using Python to help enrich their data? Only time will tell, but we’re eager to hear how you want to be using Python in dbt. Please join the [#dbt-core-python-models channel](https://www.getdbt.com/community/join-the-community/) to join in on the discussions happening around them. 
+With the new capability of [Python in dbt](/docs/build/python-models), will folks start using Python to help enrich their data? Only time will tell, but we’re eager to hear how you want to be using Python in dbt. Please join the [#dbt-core-python-models channel](https://www.getdbt.com/community/join-the-community/) to join in on the discussions happening around them. ::: ### Validating @@ -151,7 +151,7 @@ For nested data types such as JSON, you’ll want to check out the JSON parsing ### Validating -dbt offers [generic tests](https://docs.getdbt.com/docs/building-a-dbt-project/tests#more-generic-tests) in every dbt project that allows you to validate accepted, unique, and null values. They also allow you to validate the relationships between tables and that the primary key is unique. +dbt offers [generic tests](/docs/build/tests#more-generic-tests) in every dbt project that allows you to validate accepted, unique, and null values. They also allow you to validate the relationships between tables and that the primary key is unique. If you can’t find what you need with the generic tests, you can download an additional dbt testing package called [dbt_expectations](https://hub.getdbt.com/calogica/dbt_expectations/0.1.2/) that dives even deeper into how you can test the values in your columns. This package has useful tests like `expect_column_values_to_be_in_type_list`, `expect_column_values_to_be_between`, and `expect_column_value_lengths_to_equal`. diff --git a/website/docs/terms/dataframe.md b/website/docs/terms/dataframe.md index 8981c8e4648..5e74a4c7668 100644 --- a/website/docs/terms/dataframe.md +++ b/website/docs/terms/dataframe.md @@ -74,10 +74,10 @@ You could probably write hundreds of pages on DataFrame use cases and examples, Let’s just say it: there’s a lot of things you can do in Python that could do in SQL and vice versa, but Python packages typically win out when it comes to data enrichment. 
A typical use case for Python DataFrames is the ability to apply Python libraries or functions to data in the DataFrame. -In practice, this could look like applying an [IP parser](https://pypi.org/project/ipparser/) to an IP address column, using a package to determine whether a [date falls on a holiday](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/python-models#using-pypi-packages), or leveraging [numpy](https://numpy.org/) for performant and complex mathematical computations. +In practice, this could look like applying an [IP parser](https://pypi.org/project/ipparser/) to an IP address column, using a package to determine whether a [date falls on a holiday](/docs/build/python-models#using-pypi-packages), or leveraging [numpy](https://numpy.org/) for performant and complex mathematical computations. :::tip dbt x Python DataFrames -With v1.3, dbt now supports the use of beta [Python models in dbt](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/python-models). What does this mean exactly? This means that Python-defined data transformations can be created and used in a dbt project in the same vein as a classic dbt SQL model. These Python models are incredibly new and the team is eagerly looking for feedback in how folks want to use and ritualize them. +With v1.3, dbt now supports the use of beta [Python models in dbt](/docs/build/python-models). What does this mean exactly? This means that Python-defined data transformations can be created and used in a dbt project in the same vein as a classic dbt SQL model. These Python models are incredibly new and the team is eagerly looking for feedback in how folks want to use and ritualize them. ::: ### Manipulation of data during extraction and loading scripts @@ -102,6 +102,6 @@ A DataFrame is a tabular data storage format in Python that is widely used acros Are you ready to dip your toes in DataFrames, Python, and dbt? 
Check out some of the resources below to learn more about how dbt is embracing Python: -- [Python models in dbt](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/python-models) +- [Python models in dbt](/docs/build/python-models) - #beta-feedback-python-models Slack channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/) - [Best practices for developing Python models in dbt discussion](https://github.com/dbt-labs/docs.getdbt.com/discussions/1811) \ No newline at end of file diff --git a/website/docs/terms/deploying.md b/website/docs/terms/deploying.md new file mode 100644 index 00000000000..53e59658142 --- /dev/null +++ b/website/docs/terms/deploying.md @@ -0,0 +1,12 @@ +--- +id: deploying +title: Deploying +description: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. +displayText: Deploying +hoverSnippet: Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. +--- + +Deploying dbt in production means setting up a system to run a dbt job on a schedule, rather than running dbt commands manually from the command line. For more details, refer to [Deploy dbt jobs](/docs/deploy/deployments). + + + diff --git a/website/docs/terms/dimensional-modeling.md b/website/docs/terms/dimensional-modeling.md index f54f601b7d3..d0b5e9384a5 100644 --- a/website/docs/terms/dimensional-modeling.md +++ b/website/docs/terms/dimensional-modeling.md @@ -71,7 +71,7 @@ Following the example from above, a dimension table for this business would look In this table, each account only has one row. If an account’s name or status were to be updated, new values would overwrite existing records versus appending new rows. 
:::tip Snapshots -For fact tables you want to keep track of changes to, folks can leverage [dbt snapshots](https://docs.getdbt.com/docs/building-a-dbt-project/snapshots). +For fact tables you want to keep track of changes to, folks can leverage [dbt snapshots](/docs/build/snapshots). ::: ### Facts and dimensions at play with each other @@ -137,7 +137,7 @@ The benefits and drawbacks of dimensional modeling are pretty straightforward. G * **More accessibility**: Since the output of good dimensional modeling is a [data mart](https://docs.getdbt.com/guides/best-practices/how-we-structure/4-marts), the tables created are easier to understand and more accessible to end consumers. * **More flexibility**: Easy to slice, dice, filter, and view your data in whatever way suits your purpose. -* **Performance**: Fact and dimension models are typically materialized as tables or [incremental models](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models). Since these often form the core understanding of a business, they are queried often. Materializing them as tables allows them to be more performant in downstream BI platforms. +* **Performance**: Fact and dimension models are typically materialized as tables or [incremental models](https://docs.getdbt.com/docs/build/incremental-models). Since these often form the core understanding of a business, they are queried often. Materializing them as tables allows them to be more performant in downstream BI platforms. The disadvantages include: * **Navigating ambiguity**: You need to rely on your understanding of your data and stakeholder wants to model your data in a comprehensible and useful way. What you know about your data and what people really need out of the data are two of the most fundamental and difficult things to understand and balance as a data person. 
diff --git a/website/docs/terms/dml.md b/website/docs/terms/dml.md index fb8a0f6d610..7af15ba94e7 100644 --- a/website/docs/terms/dml.md +++ b/website/docs/terms/dml.md @@ -85,7 +85,7 @@ UPDATE orders SET status = 'returned' WHERE order_id = 7; ``` :::tip Tip -The `UPDATE` statement is often compared to the `MERGE` statement. With `MERGE` statements, you can insert, update, *and* delete records in a single command. Merges are often utilized when there is data between two tables that needs to be reconciled or updated. You'll see merges most commonly executed when a source table is updated and a downstream table needs to be updated as a result of this change. Learn more about [how dbt uses merges in incremental models here](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models#how-do-incremental-models-work-behind-the-scenes). +The `UPDATE` statement is often compared to the `MERGE` statement. With `MERGE` statements, you can insert, update, *and* delete records in a single command. Merges are often utilized when there is data between two tables that needs to be reconciled or updated. You'll see merges most commonly executed when a source table is updated and a downstream table needs to be updated as a result of this change. Learn more about [how dbt uses merges in incremental models here](https://docs.getdbt.com/docs/build/incremental-models#how-do-incremental-models-work-behind-the-scenes). ::: ## Conclusion diff --git a/website/docs/terms/dry.md b/website/docs/terms/dry.md index 0a0d6f14393..be3d03ed4f0 100644 --- a/website/docs/terms/dry.md +++ b/website/docs/terms/dry.md @@ -12,7 +12,7 @@ hoverSnippet: DRY is a software development principle that stands for “Don’t DRY is a software development principle that stands for “Don’t Repeat Yourself.” Living by this principle means that your aim is to reduce repetitive patterns and duplicate code and logic in favor of modular and referenceable code. 
-The DRY code principle was originally made with software engineering in mind and coined by Andy Hunt and Dave Thomas in their book, _The Pragmatic Programmer_. They believed that “every piece of knowledge must have a single, unambiguous, authoritative representation within a system.” As the field of analytics engineering and data transformation develops, there’s a growing need to adopt [software engineering best practices](https://www.getdbt.com/product/what-is-dbt/), including writing DRY code. +The DRY code principle was originally made with software engineering in mind and coined by Andy Hunt and Dave Thomas in their book, _The Pragmatic Programmer_. They believed that “every piece of knowledge must have a single, unambiguous, authoritative representation within a system.” As the field of analytics engineering and [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) develops, there’s a growing need to adopt [software engineering best practices](https://www.getdbt.com/product/what-is-dbt/), including writing DRY code. ## Why write DRY code? @@ -26,7 +26,7 @@ WET, which stands for “Write Everything Twice,” is the opposite of DRY. It's Well, how would you know if your code isn't DRY enough? That’s kind of subjective and will vary by the norms set within your organization. That said, a good rule of thumb is [the Rule of Three](https://en.wikipedia.org/wiki/Rule_of_three_(writing)#:~:text=The%20rule%20of%20three%20is,or%20effective%20than%20other%20numbers.). This rule states that the _third_ time you encounter a certain pattern, you should probably abstract it into some reusable unit. -There is, of course, a tradeoff between simplicity and conciseness in code. The more abstractions you create, the harder it can be for others to understand and maintain your code without proper documentation. 
So, the moral of the story is: DRY code is great as long as you [write great documentation.](https://docs.getdbt.com/docs/building-a-dbt-project/documentation) +There is, of course, a tradeoff between simplicity and conciseness in code. The more abstractions you create, the harder it can be for others to understand and maintain your code without proper documentation. So, the moral of the story is: DRY code is great as long as you [write great documentation.](https://docs.getdbt.com/docs/collaborate/documentation) ### Save time & energy @@ -43,7 +43,7 @@ Most teams have essential business logic that defines the successes and failures By writing DRY definitions for key business logic and metrics that are referenced throughout a dbt project and/or BI (business intelligence) tool, data teams can create those single, unambiguous, and authoritative representations for their essential transformations. Gone are the days of 15 different definitions and values for churn, and in are the days of standardization and DRYness. :::note Experimental dbt Metrics! -dbt v1.0 currently supports the use of experimental metrics, time series aggregations over a table that support zero or one dimensions. Using [dbt Metrics](https://docs.getdbt.com/docs/building-a-dbt-project/metrics), data teams can define metric calculations, ownerships, and definitions in a yaml file that lives within their dbt project. dbt Metrics are in their experimental stage; if you’re interesting in learning more about dbt Metrics, please make sure to join the #dbt-metrics-and-server channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/). +dbt v1.0 currently supports the use of experimental metrics, time series aggregations over a table that support zero or one dimensions. Using [dbt Metrics](/docs/build/metrics), data teams can define metric calculations, ownerships, and definitions in a YAML file that lives within their dbt project. 
dbt Metrics are in their experimental stage; if you’re interested in learning more about dbt Metrics, please make sure to join the #dbt-metrics-and-server channel in the [dbt Community Slack](https://www.getdbt.com/community/join-the-community/). ::: ## Tools to help you write DRY code @@ -60,7 +60,7 @@ If you’re referencing a specific query, perhaps for aggregations that join bac ### View materializations -View [materializations](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/materializations) are also extremely useful for abstracting code that might otherwise be repeated often. A view is a defined passthrough SQL query that can be run against a database. Unlike a table, it doesn’t store data, but it defines the logic that you need to use to fetch the underlying data. +View [materializations](https://docs.getdbt.com/docs/build/materializations) are also extremely useful for abstracting code that might otherwise be repeated often. A view is a defined passthrough SQL query that can be run against a database. Unlike a table, it doesn’t store data, but it defines the logic that you need to use to fetch the underlying data. If you’re referencing the same query, CTE, or block of code, throughout multiple data models, that’s probably a good sign that code should be its own view. @@ -80,7 +80,7 @@ To make any subsequent references to this view DRY-er, you simply reference the ### dbt macros and packages -dbt also supports the use of [macros](https://docs.getdbt.com/docs/building-a-dbt-project/jinja-macros) and [packages](https://docs.getdbt.com/docs/building-a-dbt-project/package-management) to help data folks write DRY code in their dbt projects. Macros are Jinja-supported functions that can be reused and applied throughout a dbt project. Packages are libraries of dbt code, typically models, macros, and/or tests, that can be referenced and used in a dbt project. 
They are a great way to use transformations for common data sources (like [ad platforms](https://hub.getdbt.com/dbt-labs/facebook_ads/latest/)) or use more [custom tests for your data models](https://hub.getdbt.com/calogica/dbt_expectations/0.1.2/) _without having to write out the code yourself_. At the end of the day, is there really anything more DRY than that? +dbt also supports the use of [macros](/docs/build/jinja-macros) and [packages](https://docs.getdbt.com/docs/build/packages) to help data folks write DRY code in their dbt projects. Macros are Jinja-supported functions that can be reused and applied throughout a dbt project. Packages are libraries of dbt code, typically models, macros, and/or tests, that can be referenced and used in a dbt project. They are a great way to use transformations for common data sources (like [ad platforms](https://hub.getdbt.com/dbt-labs/facebook_ads/latest/)) or use more [custom tests for your data models](https://hub.getdbt.com/calogica/dbt_expectations/0.1.2/) _without having to write out the code yourself_. At the end of the day, is there really anything more DRY than that? ## Conclusion diff --git a/website/docs/terms/elt.md b/website/docs/terms/elt.md index b36c0486d9e..59cfc77778c 100644 --- a/website/docs/terms/elt.md +++ b/website/docs/terms/elt.md @@ -1,12 +1,12 @@ --- id: elt -title: ELT +title: What is ELT (Extract, Load, Transform)? description: ELT is the process of first extraction data from different sources, then loading it into a data warehouse, and finally transforming it. displayText: ELT hoverSnippet: Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, loading it into a target data warehouse, and finally transforming it. --- - ELT: What it is and why it's often better than ETL + What is ELT (Extract, Load, Transform)? How does it differ from ETL? 
Extract, Load, Transform (ELT) is the process of first extracting data from different data sources, then loading it into a target data warehouse, and finally transforming it. @@ -61,7 +61,7 @@ Common ways to transform your data include leveraging modern technologies such a ## ELT vs ETL -The primary difference between the traditional ETL and the modern ELT workflow is when data transformation and loading take place. In ETL workflows, data extracted from data sources is transformed prior to being loaded into target data platforms. Newer ELT workflows have data being transformed after being loaded into the data platform of choice. Why is this such a big deal? +The primary difference between the traditional ETL and the modern ELT workflow is when [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) and loading take place. In ETL workflows, data extracted from data sources is transformed prior to being loaded into target data platforms. Newer ELT workflows have data being transformed after being loaded into the data platform of choice. Why is this such a big deal? | | ELT | ETL | |---|---|---| @@ -81,7 +81,7 @@ Why has ELT adoption grown so quickly in recent years? A few reasons: You often hear about the benefits of the ELT workflow to data, but you can sometimes forget to talk about the benefits it brings to people. There are a variety of benefits that this workflow brings to the actual data (which we’ll outline in detail below), such as the ability to recreate historical transformations, test data and data models, and more. We'll also want to use this section to emphasize the empowerment the ELT workflow brings to both data team members and business stakeholders. -### Data as code +### ELT benefit #1: Data as code Ok we said it earlier: The ELT workflow allows data teams to function like software engineers. But what does this really mean? How does it actually impact your data? 
@@ -100,7 +100,7 @@ As your business grows, the number of data sources correspondingly increases alo The ELT workflow capitalizes on transformations occurring last to provide flexibility and software engineering best practices to data transformation. Instead of having to worry about how your extraction scripts scale as your data increases, data can be extracted and loaded automatically with a few clicks. -### Bring the power to the people +### ELT benefit #2: Bring the power to the people The ELT workflow opens up a world of opportunity for the people that work on that data, not just the data itself. @@ -122,7 +122,7 @@ As mentioned earlier, the recent development of certain technologies and product |---|---|---|---| | Fivetran/HVR | E, some T, L | Fivetran is a SaaS company that helps data teams extract, load, and perform some transformation on their data. Fivetran easily integrates with modern data warehouses and dbt. They also offer transformations that leverage dbt Core. | :x: | | Stitch by Talend | E, L | Stitch (part of Talend) is another SaaS product that has many data connectors to extract data and load it into data warehouses. | :x: | -| Airbyte | E, L | Airbyte is an open-source and cloud service that allows teams to create and load pipelines. | :white_check_mark: | +| Airbyte | E, L | Airbyte is an open-source and cloud service that allows teams to create data extraction and load pipelines. | :white_check_mark: | | Funnel | E, some T, L | Funnel is another product that can extract and load data. Funnel’s data connectors are primarily focused around marketing data sources. | :x: | | dbt | T | dbt is the transformation tool that enables data analysts and engineers to transform, test, and document data in the cloud data warehouse. dbt offers both an open-source and cloud-based product. 
| :white_check_mark: | @@ -136,7 +136,4 @@ Here's some of our favorite content about the ELT workflow: - [The case for the ELT workflow](https://www.getdbt.com/analytics-engineering/case-for-elt-workflow/) - [A love letter to ETL tools](https://www.getdbt.com/analytics-engineering/etl-tools-a-love-letter/) -- [What, exactly, is dbt?](https://blog.getdbt.com/what-exactly-is-dbt/) -- [What is analytics engineering?](https://www.getdbt.com/what-is-analytics-engineering/) -- [How Fivetran operationalizes data transformations](https://www.getdbt.com/coalesce-2021/how-fivetran-operationalizes-data-transformations/) - +- [What is dbt?](https://getdbt.com/product/what-is-dbt/) \ No newline at end of file diff --git a/website/docs/terms/etl.md b/website/docs/terms/etl.md index acc0e7b1c40..321f59a65d0 100644 --- a/website/docs/terms/etl.md +++ b/website/docs/terms/etl.md @@ -1,16 +1,16 @@ --- id: etl -title: ETL +title: What is ETL (Extract, Transform, Load)? description: ETL is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. displayText: ETL -hoverSnippet: Extract, Transform, Load (ETL)is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. +hoverSnippet: Extract, Transform, Load (ETL) is the process of first extracting data from a data source, transforming it, and then loading it into a target data warehouse. --- - ETL: What is it and is it still relevant? - dbt Labs + What is ETL (Extract, Transform, Load)? How has it evolved? -ETL, or “Extract, Transform, Load”, is the process of first extracting data from a data source, transforming it, and then loading it into a target . In ETL workflows, much of the meaningful data transformation occurs outside this primary pipeline in a downstream business intelligence (BI) platform. 
+ETL, or “Extract, Transform, Load”, is the process of first extracting data from a data source, transforming it, and then loading it into a target . In ETL workflows, much of the meaningful [data transformation](https://www.getdbt.com/analytics-engineering/transformation/) occurs outside this primary pipeline in a downstream business intelligence (BI) platform. ETL is contrasted with the newer (Extract, Load, Transform) workflow, where transformation occurs after data has been loaded into the target data warehouse. In many ways, the ETL workflow could have been renamed the ETLT workflow, because a considerable portion of meaningful data transformations happen outside the data pipeline. The same transformations can occur in both ETL and ELT workflows, the primary difference is *when* (inside or outside the primary ETL workflow) and *where* the data is transformed (ETL platform/BI tool/data warehouse). @@ -48,20 +48,23 @@ In the final stage, the transformed data is loaded into your target data warehou The ETL workflow implies that your raw data does not live in your data warehouse. *Because transformations occur before load, only transformed data lives in your data warehouse in the ETL process.* This can make it harder to ensure that transformations are performing the correct functionality. -## ETL use cases and challenges - -### Example ETL use cases +## How ETL is being used While ELT adoption is growing, we still see ETL use cases for processing large volumes of data and adhering to strong data governance principles. -- **Efficiently normalize large volumes of data**: ETL can be an efficient way to perform simple normalizations across large data sets. Doing these lighter transformations across a large volume of data during loading can help get the data formatted properly and quickly for downstream use. In addition, end business users sometimes need quick access to raw or somewhat normalized data. 
Through an ETL workflow, data teams can conduct lightweight transformations on data sources and quickly expose them in their target data warehouse and downstream BI tool. -- **Hash PII prior to load**: Some companies will want to mask, hash, or remove PII values before it enters their data warehouse. In an ETL workflow, teams can transform PII to hashed values or remove them completely during the loading process. This limits where PII is available or accessible in an organization’s data warehouse. +### ETL to efficiently normalize large volumes of data + +ETL can be an efficient way to perform simple normalizations across large data sets. Doing these lighter transformations across a large volume of data during loading can help get the data formatted properly and quickly for downstream use. In addition, end business users sometimes need quick access to raw or somewhat normalized data. Through an ETL workflow, data teams can conduct lightweight transformations on data sources and quickly expose them in their target data warehouse and downstream BI tool. + +### ETL for hashing PII prior to load + +Some companies will want to mask, hash, or remove PII values before it enters their data warehouse. In an ETL workflow, teams can transform PII to hashed values or remove them completely during the loading process. This limits where PII is available or accessible in an organization’s data warehouse. -### ETL drawbacks +## ETL challenges There are reasons ETL has persisted as a workflow for over twenty years. However, there are also reasons why there’s been such immense innovation in this part of the data world in the past decade. From our perspective, the technical and human limitations we describe below are some of the reasons ELT has surpassed ETL as the preferred workflow. 
-#### Technical limitations +### ETL challenge #1: Technical limitations **Limited or lack of version control** @@ -75,7 +78,7 @@ Some teams with ETL workflows only implement much of their business logic in the While data quality testing can be done in ETL processes, not having the raw data living somewhere in the data warehouse inevitably makes it harder to ensure data models are performing the correct functionality. In addition, quality control continually gets harder as the number of data sources and pipelines within your system grows. -#### Human limitations +### ETL challenge #2: Human limitations **Data analysts can be excluded from ETL work** diff --git a/website/docs/terms/idempotent.md b/website/docs/terms/idempotent.md index 245728953c4..8772ba58b62 100644 --- a/website/docs/terms/idempotent.md +++ b/website/docs/terms/idempotent.md @@ -20,4 +20,4 @@ A non-idempotent version of the "_Save_" button might do something like "Append If word processors only gave us non-idempotent "Append paragraph" / "Update paragraph" / "Delete paragraph" operations, then saving our document changes would be a lot more difficult! We'd have to keep track of which paragraphs we previously saved, and either make sure to not save them again or have a process in place to regularly clean up duplicate paragraphs. The implementation of the "_Save_" button in word processors takes the collection of low-level non-idempotent filesystem operations (read/append/overwrite/delete), and systematically runs them in a certain order so that the _user_ doesn't have to deal with the non-idempotency. The user can just focus on writing -- choosing words, editing for clarity, ensuring paragraphs aren't too long, etc. -- and the word processor deals with making sure the words get persisted properly to disk. 
-This word processing analogy is very similar to what dbt does for data transformation: it takes the collection of low-level non-idempotent database operations (`SELECT`/`INSERT`/`UPDATE`/`DELETE` -- collectively known as DML statements), and systematically runs them in a certain order so that analytics engineers don't have to deal with non-idempotency. We can just focus on the data -- [choosing good model and column names](https://docs.getdbt.com/blog/on-the-importance-of-naming), [documenting them](https://docs.getdbt.com/docs/about/viewpoint#documentation), [ensuring data consumers can understand them](https://docs.getdbt.com/docs/guides/best-practices#consider-the-information-architecture-of-your-data-warehouse), etc. -- and [`dbt run`](https://docs.getdbt.com/reference/commands/run) will make sure the database ends up in the right state. +This word processing analogy is very similar to what dbt does for [data transformation](https://www.getdbt.com/analytics-engineering/transformation/): it takes the collection of low-level non-idempotent database operations (`SELECT`/`INSERT`/`UPDATE`/`DELETE` -- collectively known as DML statements), and systematically runs them in a certain order so that analytics engineers don't have to deal with non-idempotency. We can just focus on the data -- [choosing good model and column names](https://docs.getdbt.com/blog/on-the-importance-of-naming), [documenting them](/community/resources/viewpoint#documentation), [ensuring data consumers can understand them](https://docs.getdbt.com/docs/guides/best-practices#consider-the-information-architecture-of-your-data-warehouse), etc. -- and [`dbt run`](https://docs.getdbt.com/reference/commands/run) will make sure the database ends up in the right state. 
diff --git a/website/docs/terms/materialization.md b/website/docs/terms/materialization.md index fdeaaebfcc8..328076f1483 100644 --- a/website/docs/terms/materialization.md +++ b/website/docs/terms/materialization.md @@ -11,7 +11,7 @@ hoverSnippet: The exact Data Definition Language (DDL) that dbt will use when cr :::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create a discussion in the docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com/discussions) to begin the process of becoming a glossary contributor! +This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! ::: The exact Data Definition Language (DDL) that dbt will use when creating the model’s equivalent in a . It's the manner in which the data is represented, and each of those options is defined either canonically (tables, views, incremental), or bespoke. diff --git a/website/docs/terms/model.md b/website/docs/terms/model.md new file mode 100644 index 00000000000..c589cc196a7 --- /dev/null +++ b/website/docs/terms/model.md @@ -0,0 +1,9 @@ +--- +id: model +title: Model +description: A model is an essential building block of the DAG +displayText: model +hoverSnippet: A model is an essential building block of the DAG +--- + +A model is an essential building block of the DAG that lives in a single file and contains logic that transforms data. This logic can be expressed as a SQL `select` statement or a Python dataframe operation. Models can be materialized in the warehouse in different ways — most of these materializations require models to be built in the warehouse. 
\ No newline at end of file diff --git a/website/docs/terms/monotonically-increasing.md b/website/docs/terms/monotonically-increasing.md new file mode 100644 index 00000000000..b4e3987995d --- /dev/null +++ b/website/docs/terms/monotonically-increasing.md @@ -0,0 +1,11 @@ +--- +id: monotonically-increasing +title: Monotonically increasing +description: A monotonically increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. +displayText: monotonically increasing +hoverSnippet: A monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences 1, 6, 7, 11, 131 or 2, 5, 5, 5, 6, 10. +--- + +Monotonicity means unchanging (think monotone); a monotonic sequence is a sequence where the order of the value of the elements does not change. In other words, a monotonically-increasing sequence is a sequence whose values are sorted in ascending order and do not decrease. For example, the sequences `[1, 6, 7, 11, 131]` or `[2, 5, 5, 5, 6, 10]`. + +Monotonically-increasing values often appear in primary keys generated by production systems. In an analytics engineering context, you should avoid generating such values or assuming their existence in your models, because they make it more difficult to create an idempotent data model. Instead you should create a surrogate key which is derived from the unique component(s) of a row. 
diff --git a/website/docs/terms/predicate-pushdown.md b/website/docs/terms/predicate-pushdown.md new file mode 100644 index 00000000000..8e9bad85b6b --- /dev/null +++ b/website/docs/terms/predicate-pushdown.md @@ -0,0 +1,10 @@ +--- +id: predicate-pushdown +title: Predicate pushdown +description: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query. +displayText: Predicate pushdown +hoverSnippet: A predicate pushdown is an expression used to determine what rows in a database apply to a particular query. +--- + +A predicate pushdown is an expression used to determine what rows in a database apply to a particular query. For example, if you filter in a `WHERE` clause based on a specific dimension value, the database searches to determine what values in the database apply to the query. The optimization known as "predicate pushdown" involves applying this filtering process to the database, leading to enhanced and faster query performance. + diff --git a/website/docs/terms/primary-key.md b/website/docs/terms/primary-key.md index 2baa2ee8171..5921d3ca655 100644 --- a/website/docs/terms/primary-key.md +++ b/website/docs/terms/primary-key.md @@ -33,7 +33,7 @@ Primary keys can be established two ways: naturally or derived through the data A note on primary key data types: natural keys will often take the form of an integer or other numeric value (ex. 45932). Surrogate keys, on the other hand, are usually alphanumeric strings since they are hashed values (ex. ‘62aef884fbe3470ce7d9a92140b09b17’). :::tip Tip -dbt supports [packages](https://docs.getdbt.com/docs/building-a-dbt-project/package-management), libraries of open-source macros and data models, to help data teams avoid doing duplicative work. One of these packages, [dbt_utils](https://github.com/dbt-labs/dbt-utils), contains a series of macros that are built to alleviate common struggles in data modeling. 
The [surrogate_key](https://github.com/dbt-labs/dbt-utils#surrogate_key-source) macro offers a DRY (don’t repeat yourself) solution to creating surrogate keys across different data warehouses in the event that your data doesn’t contain natural keys. +dbt supports [packages](https://docs.getdbt.com/docs/build/packages), libraries of open-source macros and data models, to help data teams avoid doing duplicative work. One of these packages, [dbt_utils](https://github.com/dbt-labs/dbt-utils), contains a series of macros that are built to alleviate common struggles in data modeling. The [surrogate_key](https://github.com/dbt-labs/dbt-utils#surrogate_key-source) macro offers a DRY (don’t repeat yourself) solution to creating surrogate keys across different data warehouses in the event that your data doesn’t contain natural keys. ::: ## Data warehouse support for primary keys @@ -81,7 +81,7 @@ The table below gives an overview of primary key support and enforcement in some
          - @@ -112,7 +112,7 @@ BigQuery is pretty unique here in that it doesn’t support or enforce primary k ### Databricks -Databricks’ Delta tables don’t provide classical support for primary keys; instead, they call them constraints. Databricks currently offers two constraint types: `not-null` and `check`. The `not-null` one is pretty straightforward, but the `check` constraint is more unique to Databricks. With the `check` constraint, you can test that a certain boolean expression executes as `true` for each row in a table. This constraint is more likely to be helpful for ensuring accepted values are met for fields rather than for primary key requirements. +Databricks’ Delta tables in Unity Catalog provide support for declaring [informational primary keys](https://docs.databricks.com/tables/constraints.html#declare-primary-key-and-foreign-key-relationships). These primary key constraints are not enforced. Databricks currently offers [two enforced constraint](https://docs.databricks.com/tables/constraints.html#enforced-constraints-on-databricks) types: `not-null` and `check`. The `not-null` one is pretty straightforward, but the `check` constraint is more unique to Databricks. With the `check` constraint, you can test that a certain boolean expression executes as `true` for each row in a table. This constraint is more likely to be helpful for ensuring accepted values are met for fields rather than for primary key requirements. ### Postgres @@ -151,7 +151,7 @@ When we talk about testing our primary keys, we really mean testing their unique 2. For databases that don’t offer support and enforcement of primary keys, you’re going to need to regularly test that primary keys aren’t violating their golden rule of uniqueness and non-nullness. To do this, we recommend implementing a tool like dbt that allows you to define version-controlled and code-based tests on your data models. 
Using these tests, you should create [not null](https://docs.getdbt.com/reference/resource-properties/tests#not_null) and [unique](https://docs.getdbt.com/reference/resource-properties/tests#unique) tests for every primary key field throughout your dbt project. Other methods for primary key testing may look like writing custom tests or ad hoc queries that check for uniqueness and non-nullness. :::tip Tip -You can use dbt’s [documentation](https://docs.getdbt.com/docs/building-a-dbt-project/documentation) and [testing](https://docs.getdbt.com/reference/resource-properties/tests) capabilities to clearly identify and QA primary keys in your data models. For your primary key column, you should mention that the field is the unique identifier for that table and test for uniqueness and non-nullness. +You can use dbt’s [documentation](https://docs.getdbt.com/docs/collaborate/documentation) and [testing](https://docs.getdbt.com/reference/resource-properties/tests) capabilities to clearly identify and QA primary keys in your data models. For your primary key column, you should mention that the field is the unique identifier for that table and test for uniqueness and non-nullness. 
::: ## Conclusion @@ -162,4 +162,4 @@ Say it with me or get it tattooed on your lower back: every database object in y * [Testing primary keys in dbt](https://docs.getdbt.com/blog/primary-key-testing) * [Surrogate keys and dbt](https://docs.getdbt.com/blog/sql-surrogate-keys) -* [dbt Constraints Snowflake Labs package](https://hub.getdbt.com/snowflake-labs/dbt_constraints/latest/) \ No newline at end of file +* [dbt Constraints Snowflake Labs package](https://hub.getdbt.com/snowflake-labs/dbt_constraints/latest/) diff --git a/website/docs/terms/reverse-etl.md b/website/docs/terms/reverse-etl.md index 522ab364174..a3ccd0b0f70 100644 --- a/website/docs/terms/reverse-etl.md +++ b/website/docs/terms/reverse-etl.md @@ -22,7 +22,7 @@ In the reverse ETL process, transformed data is synced from a data warehouse to ![A diagram depicting how the reverse ETL process works. It starts with data being extract from data sources like email CRMs, Facebook Ad platforms, backend databases, and NetSuite. The raw data is then loaded into a data warehouse. After loading, the data is transformed and modeled. The modeled data is then loaded directly back into the tools that created the data, like Email CRMs, Facebook Ad platforms, and others so the insights are more accessible to business users.](/img/docs/terms/reverse-etl/reverse-etl-diagram.png) -The power of reverse ETL comes from sending down *already transformed data* to business platforms. Raw data, while beautiful in its own way, typically lacks the structure, aggregations, and aliasing to be useful for end business users off the bat. After data teams transform data for business use in ELT pipelines, typically to expose in an end business intelligence (BI) tool, they can also send this cleaned and meaningful data to other platforms where business users can derive value using [reverse ETL tools](#reverse-etl-tools). +The power of reverse ETL comes from sending down *already transformed data* to business platforms. 
Raw data, while beautiful in its own way, typically lacks the structure, aggregations, and aliasing to be useful for end business users off the bat. After data teams transform data for business use in ELT pipelines, typically to expose in an end business intelligence (BI) tool, they can also send this cleaned and meaningful data to other platforms where business users can derive value using [reverse ETL tools](#reverse-etl-tools). Data teams can choose to write additional transformations that may need to happen for end business tools in reverse ETL tools themselves or by creating [additional models in dbt](https://getdbt.com/open-source-data-culture/reverse-etl-playbook/). @@ -76,6 +76,7 @@ Reverse ETL tools typically establish the connection between your data warehouse | Tool | Description | Open source option? | |:---:|:---:|:---:| | Hightouch | A platform to sync data models and create custom audiences for downstream business platforms. | :x: | +| Polytomic | A unified sync platform for syncing to and from data warehouses (ETL and Reverse ETL), databases, business apps, APIs, and spreadsheets. | :x: | | Census | Another reverse ETL tool that can sync data from your data warehouse to your go-to-market tools. | :x: | | Rudderstack | Also a CDP (customer data platform), Rudderstack additionally supports pushing down data and audience to external tools, such as ad platforms and email CRMs. | :white_check_mark: | | Grouparoo | Grouparoo, part of Airbyte, is an open source framework to move data from data warehouses to different cloud-based tools. 
| :white_check_mark: | diff --git a/website/docs/terms/surrogate-key.md b/website/docs/terms/surrogate-key.md index 718d3f53c92..e57a0b74a7f 100644 --- a/website/docs/terms/surrogate-key.md +++ b/website/docs/terms/surrogate-key.md @@ -244,7 +244,7 @@ dbt supports several macros to help data folks write DRY (don’t repeat yoursel ## Performance concerns for surrogate keys -In the past, you may have seen surrogate keys take the form of monotonically increasing integers (ex. 1, 2, 3, 4). These surrogate keys were often limited to 4-bit integers that could be indexed quickly. However, in the practice of analytics engineering, surrogate keys derived from the data often take the form of a hashed string value. Given this form, these surrogate keys are not necessarily optimized for performance for large table scans and complex joins. For large data models (millions, billions, trillions of rows) that have surrogate keys, you should materialize them as tables or [incremental models](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/configuring-incremental-models) to help make joining entities more efficient. +In the past, you may have seen surrogate keys take the form of monotonically increasing integers (ex. 1, 2, 3, 4). These surrogate keys were often limited to 4-byte integers that could be indexed quickly. However, in the practice of analytics engineering, surrogate keys derived from the data often take the form of a hashed string value. Given this form, these surrogate keys are not necessarily optimized for performance for large table scans and complex joins. For large data models (millions, billions, trillions of rows) that have surrogate keys, you should materialize them as tables or [incremental models](https://docs.getdbt.com/docs/build/incremental-models) to help make joining entities more efficient. 
## Conclusion diff --git a/website/docs/terms/table.md b/website/docs/terms/table.md index 08289a84c8e..cbe36ec1315 100644 --- a/website/docs/terms/table.md +++ b/website/docs/terms/table.md @@ -1,11 +1,12 @@ --- id: table title: Table +description: "Read this guide to understand how tables work in dbt." displayText: table hoverSnippet: In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. --- :::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create a discussion in the docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com/discussions) to begin the process of becoming a glossary contributor! +This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! ::: In simplest terms, a table is the direct storage of data in rows and columns. Think excel sheet with raw values in each of the cells. diff --git a/website/docs/terms/view.md b/website/docs/terms/view.md index 7357e8c60c8..90cd5d1f36f 100644 --- a/website/docs/terms/view.md +++ b/website/docs/terms/view.md @@ -1,11 +1,12 @@ --- id: view title: View +description: Read this guide to understand how views work in dbt. displayText: view hoverSnippet: A view (as opposed to a table) is a defined passthrough SQL query that can be run against a database (or data warehouse). --- :::important This page could use some love -This term would benefit from additional depth and examples. Have knowledge to contribute? [Create a discussion in the docs.getdbt.com GitHub repository](https://github.com/dbt-labs/docs.getdbt.com/discussions) to begin the process of becoming a glossary contributor! 
+This term would benefit from additional depth and examples. Have knowledge to contribute? [Create an issue in the docs.getdbt.com repository](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) to begin the process of becoming a glossary contributor! ::: A view (as opposed to a ) is a defined passthrough SQL query that can be run against a database (or ). A view doesn’t store data, like a table does, but it defines the logic that you need to fetch the underlying data. diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 1d062efa979..0eae62ecec3 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -1,7 +1,7 @@ const path = require("path"); const math = require("remark-math"); const katex = require("rehype-katex"); -const { versions, versionedPages } = require("./dbt-versions"); +const { versions, versionedPages, versionedCategories } = require("./dbt-versions"); require("dotenv").config(); /* Debugging */ @@ -44,10 +44,16 @@ var siteSettings = { tagline: "End user documentation, guides and technical reference for dbt", title: "dbt Developer Hub", url: SITE_URL, - onBrokenLinks: "warn", + onBrokenLinks: "throw", onBrokenMarkdownLinks: "throw", trailingSlash: false, themeConfig: { + docs:{ + sidebar: { + hideable: true, + autoCollapseCategories: true, + }, + }, image: "/img/avatar.png", colorMode: { defaultMode: "light", @@ -63,15 +69,18 @@ var siteSettings = { //debug: true, }, announcementBar: { - id: "live_qa", + id: "biweekly-demos", content: - "Take the 5-minute dbt Community Survey!", - backgroundColor: "#047377", + "Register now for Coalesce 2023. 
The Analytics Engineering Conference!", + backgroundColor: "#7444FD", textColor: "#fff", isCloseable: true, }, announcementBarActive: true, - announcementBarLink: "https://www.surveymonkey.com/r/XP2N8Z3", + announcementBarLink: "https://coalesce.getdbt.com/", + // Set community spotlight member on homepage + // This is the ID for a specific file under docs/community/spotlight + communitySpotlightMember: "faith-lierheimer", prism: { theme: (() => { var theme = require("prism-react-renderer/themes/nightOwl"); @@ -107,13 +116,13 @@ var siteSettings = { activeBaseRegex: "docs/(?!(dbt-cloud))", }, { - to: "/reference/dbt_project.yml", + to: "reference/references-overview", label: "Reference", position: "left", activeBasePath: "reference", }, { - label: "Learn", + label: "Resources", position: "right", items: [ { @@ -124,6 +133,10 @@ var siteSettings = { label: 'Guides', to: '/guides/best-practices', }, + { + label: "Quickstarts", + to: "/quickstarts", + }, { label: "Developer Blog", to: "/blog", @@ -154,6 +167,10 @@ var siteSettings = { label: "Events", to: "/community/events", }, + { + label: "Spotlight", + to: "/community/spotlight", + }, ], }, { @@ -172,7 +189,7 @@ var siteSettings = { Terms of Service Privacy Policy Security - +
          Databricks ❌ +
          + <> + {csvData.map((row, i) => ( + i === 0 + ? ( + + + {row.map(col => ( + + ))} + + + ) : ( + + + {row.map(col => ( + + ))} + + + ) + ))} + +
          {col}
          {col}
          + + )} + {showLineage &&
          + { + handleFileSelect({target: { dataset : node.data }}) + }} /> +
          } + + + + + )} + + ); +} + +export default dbtEditor; diff --git a/website/src/components/dbt-editor/menu-item.js b/website/src/components/dbt-editor/menu-item.js new file mode 100644 index 00000000000..2857b65b4c8 --- /dev/null +++ b/website/src/components/dbt-editor/menu-item.js @@ -0,0 +1,65 @@ +import React, { useState } from 'react' +import styles from './styles.module.css'; + +// Build submenu for each menu item +// This self-invokes the MenuItem component from within itself +const buildSubItems = (thisSubItem, isResource, handleFileSelect) => { + // Group all directories & nodes for items + const subItems = [] + thisSubItem?.directories?.length && + subItems.push(...thisSubItem.directories) + thisSubItem?.nodes?.length && + subItems.push(...thisSubItem.nodes) + + // Return each submenu item as its own menu item + return +} + +export default function MenuItem({ item, name, subItems, defaultOpen = false, isResource = false, isNode, handleFileSelect }) { + const [itemOpen, setItemOpen] = useState(defaultOpen) + return ( +
        • + handleFileSelect(e) + : () => setItemOpen(!itemOpen)} + data-resource_type={item?.resourceType && item.resourceType} + data-node_name={item?.isNode && item.node} + data-file_name={item?.name} + > + {item?.isNode ? ( + + ) : ( + + )} + {name} + + {itemOpen && ( + <> + {subItems && ( +
            + {subItems.map(subItem => ( + + {buildSubItems(subItem, isResource, handleFileSelect)} + + ))} +
          + )} + + )} +
        • + ) +} diff --git a/website/src/components/dbt-editor/styles.module.css b/website/src/components/dbt-editor/styles.module.css new file mode 100644 index 00000000000..1e530a93bab --- /dev/null +++ b/website/src/components/dbt-editor/styles.module.css @@ -0,0 +1,209 @@ +.dbtEditor { + width: 100%; + max-width: 100%; + max-height: 770px; + overflow: hidden; + display: flex; + flex-wrap: wrap; + color: #111827; + border-radius: 10px; + border: 1px solid #E5E7EB; + background: #fff; + position: relative; +} +.dbtEditor.fullHeight { + max-height: 100%; + height: 100%; +} +@media (min-width: 1200px) { + .dbtEditor { + flex-wrap: nowrap; + } +} + +.dbtLineageContainer { + height: 265px; + width: 100%; + padding: 10px; +} + +.dbtLineageContainer > div { + height: 100%; + width: 100%; +} + +/* Sidebar */ +.dbtEditorSidebar { + flex: 1 0 100%; + max-height: 200px; + overflow-y: auto; + overflow-x: hidden; + background: #F9FAFB; + border-right: 1px solid #E5E7EB; + padding: 20px; + font-size: 14px; + border-top-left-radius: 10px; + border-bottom-left-radius: 10px; +} +@media (min-width: 1200px) { + .dbtEditorSidebar { + flex: 1 0 256px; + max-width: 256px; + max-height: initial; + } +} +.dbtEditorSidebar .sidebarHeader { + font-weight: 600; + margin-bottom: 5px; + display: block; +} +.dbtEditorSidebar .sidebarList { + font-weight: 400; + list-style: none; + padding-left: 0; +} +.dbtEditorSidebar .sidebarNestedList { + list-style: none; + padding-left: 10px; +} +.dbtEditorSidebar .sidebarList li { + font-size: 12px; + cursor: pointer; +} +.listItem { + display: flex; + align-items: center; +} +.dbtEditorSidebar .sidebarList li img { + margin-right: 5px; +} +.hideItem { + display: none; +} + +/* Main editor */ +.dbtEditorMain { + flex: 1 1 100%; + overflow: auto; + height: 570px; + width: 100%; +} +.dbtEditor.fullHeight .dbtEditorMain { + height: 100%; +} +@media (min-width: 1200px) { + .dbtEditorMain { + flex: 1 1 calc(100% - 256px); + height: initial; + max-width: 
initial; + } +} + +/* CLI */ +.dbtEditorCli { + padding: 20px 0; + height: 50%; +} + +/* Actions */ +.dbtEditorActions { + display: flex; + background: #F9FAFB; + padding: 5px 20px 5px 24px; +} +.editorAction { + margin-right: 15px; + background: #fff; + border: 1px solid #D1D5DB; + color: #111827; + box-shadow: 0px 1px 2px rgba(0, 0, 0, 0.05); + border-radius: 4px; + padding: 7px 11px; + transition: 100ms all ease-in-out; + cursor: pointer; +} +.editorAction:hover { + background: #047377; + border: 1px solid #047377; + color: #fff; +} +.editorAction:last-of-type { + margin-right: 0; +} + +/* Results */ +.resultsHeader { + color: #1F2937; + background: #fff; + padding: 10px; + font-size: 12px; + display: flex; + align-items: center; +} +.resultsHeader span { + color: #4B5563; +} +.resultsHeader img { + margin-left: 5px; +} + +.dbtEditorResults table { + display: table; + color: #111827; + background: #fff; + margin: 0; +} +.dbtEditorResults table, .dbtEditorResults table thead, .dbtEditorResults table tbody { + width: 100%; +} +.dbtEditorResults tr { + border-bottom: 1px solid #E5E7EB; +} +.dbtEditorResults tr:last-of-type { + border-bottom: none; +} +.dbtEditorResults table thead tr { + border-top: 1px solid #E5E7EB; +} +.dbtEditorResults table th { + text-align: left; + color: #4B5563; + font-weight: 600; +} +.dbtEditorResults table th, .dbtEditorResults table td { + font-size: 14px; + border: none; + color: #111827 !important; +} +.dbtEditorResults table tr:nth-child(2n) { + background: #fff; +} +.dbtEditorResults table tr th:first-of-type, +.dbtEditorResults table tr td:first-of-type { + padding-left: 24px; +} + +.expandIcon { + position: absolute; + top: 1rem; + right: 1rem; + background: rgba(0,0,0,.2); + z-index: 2; + color: #fff; + font-size: .8rem; + border-radius: 5px; + height: 25px; + width: 25px; + transition: all ease-in-out 100ms; + cursor: pointer; +} +.expandIcon:hover { + background: rgba(0,0,0,.25); +} +.expandIcon i { + display: flex; + 
align-items: center; + justify-content: center; + height: 100%; + width: 100%; +} diff --git a/website/src/components/dbt-editor/utils/build-sidebar.js b/website/src/components/dbt-editor/utils/build-sidebar.js new file mode 100644 index 00000000000..767622c7f1d --- /dev/null +++ b/website/src/components/dbt-editor/utils/build-sidebar.js @@ -0,0 +1,75 @@ +// Util: Get packages +export const buildSidebar = (nodes, tag) => { + const projectData = [] + for(let node in nodes) { + const thisNode = nodes[node] + + // If node does not included the current tag, continue to next node + if(!thisNode?.tags?.includes(tag)) continue + + const nodePath = thisNode?.path?.split('/') + // If path not available in node, skip item in loop + if(!nodePath) continue + + // Build node object + const nodeObject = { + node, + name: thisNode.name, + isNode: true, + resourceType: thisNode.resource_type + } + + // Set top-level directories + let thisPackage = projectData.find(project => project?.project === thisNode.package_name) + if(!thisPackage) { + // Create new top-level package if not found + thisPackage = { + project: thisNode.package_name, + resources: [], + } + projectData.push(thisPackage) + } + + // Set resources + let packagesResources = thisPackage?.resources?.find(resource => resource?.name === thisNode.resource_type) + if(!packagesResources) { + packagesResources = { + name: thisNode.resource_type, + directories: [], + nodes: [] + } + thisPackage.resources.push(packagesResources) + } + + // Set directories + const directory = thisNode.path.substr(0, thisNode.path.indexOf('/')) + + if(!directory) { + // Add node to top level + let packageNodes = packagesResources?.nodes?.find(node => node.name === thisNode.name) + if(!packageNodes) { + packageNodes = nodeObject + packagesResources.nodes.push(packageNodes) + } + } else { + // Add node to directory + let directoryNode = packagesResources?.directories?.find(dir => { + if(dir.name === directory) return true + }) + // If this 
directory does not exist + // create new directory and add current node + // Else, add node to existing directory + if(!directoryNode && directory) { + directoryNode = { + name: directory, + nodes: [nodeObject] + } + packagesResources.directories.push(directoryNode) + } else { + directoryNode.nodes.push(nodeObject) + } + } + } + + return projectData +} diff --git a/website/src/components/dbt-editor/utils/parse-csv.js b/website/src/components/dbt-editor/utils/parse-csv.js new file mode 100644 index 00000000000..ee8beea5d7c --- /dev/null +++ b/website/src/components/dbt-editor/utils/parse-csv.js @@ -0,0 +1,38 @@ +import Papa from "papaparse" +import axios from 'axios' + +export const parseCsv = async (project, filename) => { + try { + const file = await axios(`/dbt_projects/${project}/seeds/${filename}.csv`) + if(!file?.data) throw new Error('unable to find project data.') + + // If user clicks the parse button without + // a file we show a error + if (!file) throw new Error("Enter a valid file") + + let csvData = [] + // let csvHeaders + Papa.parse(file.data, { + header: false, + complete: (results) => { + const { data } = results + csvData = data + }, + error: (err) => { + console.log(err) + }, + }); + + // If data available, return data + // Otherwise throw error + if(csvData?.length > 0) { + return csvData.slice(0, 500) + } else { + throw new Error('csvData not found') + } + } catch(err) { + console.log('Unable to parse CSV.', err) + } + + +} diff --git a/website/src/components/discourse/discourse.cy.js b/website/src/components/discourse/discourse.cy.js new file mode 100644 index 00000000000..27a7ad6a51e --- /dev/null +++ b/website/src/components/discourse/discourse.cy.js @@ -0,0 +1,77 @@ +import React from 'react' +import { DiscourseFeed } from './index' + +const mockResponseData = [ + { + "id": 4911, + "title": "Processing source tables with differing arrival times throughout day", + "has_accepted_answer": true, + "author": "MatthewMunn", + "like_count": 1, + 
"posts_count": 2 + }, +] + +describe('Test Discourse component', () => { + it('Should render same text passed into title prop', () => { + cy.mount() + cy.contains('Open topics').should('be.visible') + }) + + it('Should display loading icon on inital load', () => { + cy.intercept('POST', '**/.netlify/functions/get-discourse-topics', { + delayMs: 100, + } + ).as('getTopics') + cy.mount() + cy.get('[data-testid="feed-loader"]', { timeout: 100 }).should('be.visible') + cy.wait('@getTopics') + cy.get('[data-testid="error-text"]').should('be.visible') + + }) + + it('Should display Discourse data after API fetch', () => { + cy.intercept('POST', '**/.netlify/functions/get-discourse-topics', (req) => { + req.reply(mockResponseData); + }).as('getTopics') + + cy.mount() + cy.wait('@getTopics') + cy.get('[data-testid="topics-list"]').as('topicsList') + cy.get('@topicsList').should('have.length', 1) + cy.get('@topicsList').eq(0).get('span[title="Solved"]').should('be.visible') + cy.get('@topicsList').eq(0).should('contain.text', 'Processing source tables') + cy.get('@topicsList').eq(0).contains('MatthewMunn').should('be.visible') + cy.get('@topicsList').eq(0).contains('1 reply').should('be.visible') + cy.get('@topicsList').eq(0).contains('1 like').should('be.visible') + }) + + it('Should show cta with correct text and href', () => { + cy.intercept('POST', '**/.netlify/functions/get-discourse-topics', (req) => { + req.reply(mockResponseData); + }).as('getTopics') + + cy.mount() + cy.wait('@getTopics') + + cy.get('[data-testid="feed-cta"]').as('btn') + cy.get('@btn').should('exist') + cy.get('@btn').invoke('attr', 'href').should('eq', 'https://discourse.getdbt.com/c/help/19') + cy.get('@btn').invoke('attr', 'title').should('eq', 'See open topics') + }) + + it('Should display message when no topics found', () => { + cy.intercept('POST', '**/.netlify/functions/get-discourse-topics', (req) => { + req.reply([]); + }).as('getTopics') + + cy.mount() + cy.wait('@getTopics') + 
cy.get('[data-testid="error-text"]').should('be.visible') + + }) +}) diff --git a/website/src/components/discourse/index.js b/website/src/components/discourse/index.js index fe6b0dc1130..97ef08a5272 100644 --- a/website/src/components/discourse/index.js +++ b/website/src/components/discourse/index.js @@ -38,10 +38,8 @@ export const DiscourseFeed = ({ setLoading(true) setIsError(false) - // Build Netlify Function endpoint - const endpoint = window?.location?.hostname?.includes('localhost') - ? 'http://localhost:8888/.netlify/functions/get-discourse-topics' - : '/.netlify/functions/get-discourse-topics' + // Build function endpoint + const endpoint = `/api/get-discourse-topics` // If 'after' prop not passed in, set relative after date let afterDate = after @@ -96,7 +94,7 @@ export const DiscourseFeed = ({ // Set initial min-height // This is to avoid layout shifts // which affects Lighthouse performance scores - const setMinHeight = isError + const setMinHeight = isError || !topics?.length > 0 ? 'auto' : 414 @@ -147,7 +145,7 @@ export const DiscourseFeed = ({ )} {show_cta && ( - {link_text} + {link_text} )} ) @@ -201,7 +199,7 @@ export const DiscourseHelpFeed = ({ function TopicWrapper({ topic, children }) { if(topic?.slug && topic?.id) { return ( - {children} + {children} ) } else { return ( diff --git a/website/src/components/discourseBlogComments/discourseBlogComments.cy.js b/website/src/components/discourseBlogComments/discourseBlogComments.cy.js new file mode 100644 index 00000000000..3177c4ee97f --- /dev/null +++ b/website/src/components/discourseBlogComments/discourseBlogComments.cy.js @@ -0,0 +1,136 @@ +import React from 'react' +import { DiscourseBlogComments } from './index' + +const mockResponseData = { + topicId: 5650, + comments: [ + { + id: 9335, + name: "John Rock", + username: "john.rock", + avatar_template: + "/user_avatar/discourse.getdbt.com/john.rock/{size}/1430_2.png", + created_at: "2022-12-15T19:37:00.436Z", + cooked: + '

          Testing another comment.

          \n
          \n

          Nasdaq at large has always been a data driven company. Over the years how we access and model data has changed but the desire for data driven outcomes has not.

          \n
          \n
                <div>\n        <h2>Comments</h2>\n        {resultData()}\n      </div>\n
          ', + post_number: 2, + post_type: 1, + updated_at: "2022-12-15T19:37:00.436Z", + reply_count: 1, + reply_to_post_number: null, + quote_count: 0, + incoming_link_count: 1, + reads: 1, + readers_count: 0, + score: 10.2, + yours: false, + topic_id: 5650, + topic_slug: + "how-to-move-data-from-spreadsheets-into-your-data-warehouse", + display_username: "John Rock", + primary_group_name: null, + flair_name: null, + flair_url: null, + flair_bg_color: null, + flair_color: null, + version: 1, + can_edit: true, + can_delete: true, + can_recover: false, + can_wiki: true, + read: false, + user_title: "dbt Labs", + title_is_group: false, + bookmarked: false, + actions_summary: [ + { + id: 2, + can_act: true, + }, + { + id: 3, + can_act: true, + }, + { + id: 4, + can_act: true, + }, + { + id: 8, + can_act: true, + }, + { + id: 6, + can_act: true, + }, + { + id: 7, + can_act: true, + }, + ], + moderator: true, + admin: true, + staff: true, + user_id: 3019, + hidden: false, + trust_level: 4, + deleted_at: null, + user_deleted: false, + edit_reason: null, + can_view_edit_history: true, + wiki: false, + reviewable_id: 0, + reviewable_score_count: 0, + reviewable_score_pending_count: 0, + mentioned_users: [], + akismet_state: null, + user_created_at: "2022-12-08T15:24:08.563Z", + user_date_of_birth: null, + can_accept_answer: false, + can_unaccept_answer: false, + accepted_answer: false, + }, + ], +}; + +describe('Test DiscourseBlogComments component', () => { + it('Should display loading icon on inital load', () => { + cy.intercept('GET', '**/.netlify/functions/get-discourse-comments*', + { + delayMs: 100, + }).as('getComments') + + cy.mount() + cy.get('[data-testid="feed-loader"]', { timeout: 100 }).should('be.visible') + cy.wait('@getComments') + cy.get('[data-testid="error-text"]').should('be.visible') + + }) + + it('Should display Discourse data after API fetch', () => { + cy.intercept('GET', '**/.netlify/functions/get-discourse-comments*', 
mockResponseData).as('getComments') + + cy.mount() + cy.wait('@getComments') + cy.get('[data-testid="comments-list"]').as('commentsList') + cy.get('@commentsList').should('have.length', 1) + cy.get('@commentsList').eq(0).should('contain.text', 'Testing another comment.') + + }) + it("Should display error message if API fetch fails", () => { + cy.intercept('GET', '**/.netlify/functions/get-discourse-comments*', { + statusCode: 404 + }).as('getComments') + + cy.mount() + cy.wait('@getComments') + cy.get('[data-testid="error-text"]').should('be.visible') + + }); +}) diff --git a/website/src/components/discourseBlogComments/index.js b/website/src/components/discourseBlogComments/index.js new file mode 100644 index 00000000000..7684269f92a --- /dev/null +++ b/website/src/components/discourseBlogComments/index.js @@ -0,0 +1,128 @@ +import React, { useState, useEffect } from 'react' +import styles from './styles.module.css' +import axios from 'axios' +import sanitizeHtml from 'sanitize-html'; + +export const DiscourseBlogComments = ({title,slug}) => { + + const DISCOURSE_TOPIC_ENDPOINT = `https://discourse.getdbt.com/t/` + const commentsToLoad = 6 + + const [postSlug, setPostSlug] = useState(slug) + const [comments, setComments] = useState([]) + const [topicId, setTopicId] = useState(null) + const [loading, setLoading] = useState(true) + const [isError, setIsError] = useState(false) + const [next, setNext] = useState(commentsToLoad) + + // Handle loading more comments + const loadMoreComments = () => { + setNext(next + commentsToLoad) + } + + useEffect(() => { + let isMounted = true + + setPostSlug(slug) + + const fetchData = async () => { + try { + + const endpoint = `/api/get-discourse-comments?title=${title}&slug=${slug}` + + const { data } = await axios.get(endpoint) + + // Set error state if data not available + if(!data) throw new Error('Unable to get latest topics.') + + // Set topics count + if(isMounted && data) { + setComments(data.comments) + 
setTopicId(data.topicId) + setLoading(false) + + } + + } catch(err) { + setIsError(true) + setLoading(false) + } + } + fetchData() + + return () => { + isMounted = false + } + + }, [postSlug]) + + const resultData = () => { + if (loading) { + return Loading + } else if (isError) { + return

          Error loading comments. Please try again later.

          + } else if (!comments?.length && !isError) { + return ( + + ) + } else { + return ( +
          +
            + {comments?.slice(0, next)?.map((comment) => ( +
          • + {" "} +
            + {comment.username}{" "} + + {comment.user_title} + +
            +
            +
          • + ))} + {next < comments?.length && ( + + )} + + Continue discussion + +
          +
          + ); + } + } + + return ( +
          +

          Comments

          + {resultData()} +
          + ) + } diff --git a/website/src/components/discourseBlogComments/index.test.js b/website/src/components/discourseBlogComments/index.test.js new file mode 100644 index 00000000000..d7feb39c12d --- /dev/null +++ b/website/src/components/discourseBlogComments/index.test.js @@ -0,0 +1,154 @@ +import React from "react"; +import axios from "axios"; +import { render, screen, act } from "@testing-library/react"; +import { DiscourseBlogComments } from "./index"; + +// Mock api data +const mockAxiosResponse = { + data: { + topicId: 5650, + comments: [ + { + id: 9335, + name: "John Rock", + username: "john.rock", + avatar_template: + "/user_avatar/discourse.getdbt.com/john.rock/{size}/1430_2.png", + created_at: "2022-12-15T19:37:00.436Z", + cooked: + '

          Testing another comment.

          \n
          \n

          Nasdaq at large has always been a data driven company. Over the years how we access and model data has changed but the desire for data driven outcomes has not.

          \n
          \n
                <div>\n        <h2>Comments</h2>\n        {resultData()}\n      </div>\n
          ', + post_number: 2, + post_type: 1, + updated_at: "2022-12-15T19:37:00.436Z", + reply_count: 1, + reply_to_post_number: null, + quote_count: 0, + incoming_link_count: 1, + reads: 1, + readers_count: 0, + score: 10.2, + yours: false, + topic_id: 5650, + topic_slug: + "how-to-move-data-from-spreadsheets-into-your-data-warehouse", + display_username: "John Rock", + primary_group_name: null, + flair_name: null, + flair_url: null, + flair_bg_color: null, + flair_color: null, + version: 1, + can_edit: true, + can_delete: true, + can_recover: false, + can_wiki: true, + read: false, + user_title: "dbt Labs", + title_is_group: false, + bookmarked: false, + actions_summary: [ + { + id: 2, + can_act: true, + }, + { + id: 3, + can_act: true, + }, + { + id: 4, + can_act: true, + }, + { + id: 8, + can_act: true, + }, + { + id: 6, + can_act: true, + }, + { + id: 7, + can_act: true, + }, + ], + moderator: true, + admin: true, + staff: true, + user_id: 3019, + hidden: false, + trust_level: 4, + deleted_at: null, + user_deleted: false, + edit_reason: null, + can_view_edit_history: true, + wiki: false, + reviewable_id: 0, + reviewable_score_count: 0, + reviewable_score_pending_count: 0, + mentioned_users: [], + akismet_state: null, + user_created_at: "2022-12-08T15:24:08.563Z", + user_date_of_birth: null, + can_accept_answer: false, + can_unaccept_answer: false, + accepted_answer: false, + }, + ], + }, +}; + +describe("Test DiscourseBlogComments component", () => { + // returns mocks to original state (ex: window = undefined) + afterEach(() => jest.restoreAllMocks()); + + it("Should display loading icon on inital load", () => { + render(); + const img = screen.getByTestId("feed-loader"); + expect(img).toBeInTheDocument(); + }); + + it("Should display Discourse data after API fetch", async () => { + // Get mock api response + jest.spyOn(axios, "get").mockResolvedValueOnce(mockAxiosResponse); + + act(async () => + render( + + ) + ); + // Comment should exist in document + 
const commentText = await screen.findByText(/Testing another comment./i); + expect(commentText).toBeInTheDocument(); + }); + + it("Should display error message if API fetch fails", async () => { + jest.spyOn(axios, "post").mockResolvedValueOnce(''); + // Simulate a 404 error from API + jest.spyOn(axios, "get").mockRejectedValueOnce({ + response: { + status: 404, + statusText: 'Not Found' + } + }); + + act(async () => + render( + + ) + ); + + // Error message should display + const errorMessage = await screen.findByTestId("error-text"); + expect(errorMessage).toBeInTheDocument(); + + }); +}); diff --git a/website/src/components/discourseBlogComments/styles.module.css b/website/src/components/discourseBlogComments/styles.module.css new file mode 100644 index 00000000000..ce1550d8de4 --- /dev/null +++ b/website/src/components/discourseBlogComments/styles.module.css @@ -0,0 +1,79 @@ +.commentList { + list-style: none; + padding: 0; + margin: 0; +} + +.commentContainer h4 { + font-size: 2rem; + margin-top: 3rem; +} + +.commentList > li { + margin-bottom: 2rem; +} + +.commentContainer ul li pre { + max-height: 400px; + height: 100%; +} + +.commentContainer ul li ul { + list-style: disc; + margin-bottom: 1rem; +} + +.commentContainer ul li ol { + list-style: number; + margin-bottom: 1rem; +} + +.commentContainer ul li aside { + margin: 1rem; + padding: .5rem; + opacity: .7; + border-left: var(--ifm-blockquote-border-left-width) solid var(--ifm-blockquote-border-color); +} + +.commentContainer ul li aside blockquote { + border-left: 0; +} + +.commentContainer ul li aside img{ + display: none; +} + +.commentContainer .username { + font-weight: 700; +} + +.commentContainer .userTitle { + opacity: .7; +} + +/* Cta */ +.discourseCta { + font-size: 1.1rem; + margin-top: auto; + color: #fff; + border: none; + } +.discourseCta:hover { + background: var(--color-light-teal); + color: #fff; +} + +.loadMoreCta { + display: block; + padding: .5rem 0; + margin-bottom: 3rem; +} + 
+.loadMoreCta:hover { + text-decoration: none; +} + +.loadMoreCta:after { + content: ' »'; +} + diff --git a/website/src/components/docCarousel/index.js b/website/src/components/docCarousel/index.js new file mode 100644 index 00000000000..4f6a11b644c --- /dev/null +++ b/website/src/components/docCarousel/index.js @@ -0,0 +1,49 @@ +import React from 'react'; +import { Swiper, SwiperSlide } from 'swiper/react'; +import 'swiper/css'; +import { Navigation, Pagination } from 'swiper'; +import 'swiper/css/navigation'; +import 'swiper/css/pagination'; + +function DocCarousel({ slidesPerView = 3, children }) { + if ( !children?.length > 0 ){ + return false + } + + // Limit slidesPerView to max of 4 + if(slidesPerView > 4) { + slidesPerView = 4 + } + + return ( +
          + + {children.map((item) => {item})} + +
          + ); +} + +export default DocCarousel; + diff --git a/website/src/components/faqList/index.js b/website/src/components/faqList/index.js index e29cbfddd54..62c0041c9a8 100644 --- a/website/src/components/faqList/index.js +++ b/website/src/components/faqList/index.js @@ -1,5 +1,4 @@ import React from 'react'; -import styles from './styles.module.css'; const files = require.context( '../../../docs/faqs', @@ -30,7 +29,7 @@ files.keys().forEach(function (key, i) { ); }); -function FAQList({ children, style }) { +function FAQList({ style }) { return (
          {faqs} diff --git a/website/src/components/faqs/index.js b/website/src/components/faqs/index.js index 4f07d9127f4..52c4573d883 100644 --- a/website/src/components/faqs/index.js +++ b/website/src/components/faqs/index.js @@ -2,18 +2,18 @@ import React, { useState, useEffect } from 'react'; import styles from './styles.module.css'; import { usePluginData } from '@docusaurus/useGlobalData'; -function FAQ({ children, src, alt_header = null }) { +function FAQ({ path, alt_header = null }) { const [isOn, setOn] = useState(false); - const [filePath, setFilePath] = useState(src) + const [filePath, setFilePath] = useState(path) const [fileContent, setFileContent] = useState({}) // Get all faq file paths from plugin const { faqFiles } = usePluginData('docusaurus-build-global-data-plugin'); useEffect(() => { - // Search for faq where frontmatter ID matches src prop - const faqFile = faqFiles.find(file => file.id === src) + // Search for faq where frontmatter ID matches path prop + const faqFile = faqFiles.find(file => file.id === path) // If faqFile found with ID, set filePath for this file if (faqFile?.id) { diff --git a/website/src/components/faqs/styles.module.css b/website/src/components/faqs/styles.module.css index baeecd2c2f5..e19156a3a7b 100644 --- a/website/src/components/faqs/styles.module.css +++ b/website/src/components/faqs/styles.module.css @@ -29,7 +29,10 @@ margin-bottom: 10px; padding: 20px; background-color: #e3f8f8; - color: var(--ifm-color-gray-900); +} + +:local(html[data-theme='dark'] .body) { + background: #333b47; } :local(.body > p:last-child) { diff --git a/website/src/components/hero/index.js b/website/src/components/hero/index.js index b6dbe7e245b..e4bef8e234b 100644 --- a/website/src/components/hero/index.js +++ b/website/src/components/hero/index.js @@ -1,16 +1,15 @@ import React from 'react'; import styles from './styles.module.css'; -import { useColorMode } from '@docusaurus/theme-common'; - -function Hero({ heading, subheading, 
showGraphic = false }) { - const { isDarkTheme } = useColorMode(); +function Hero({ heading, subheading, showGraphic = false, customStyles = {}, classNames = '', colClassNames = '' }) { return ( -
          -
          +
          + {showGraphic && ( +
          + )}
          -
          +

          {heading}

          {subheading}

          diff --git a/website/src/components/hero/styles.module.css b/website/src/components/hero/styles.module.css index efacee33f6e..f596b53762a 100644 --- a/website/src/components/hero/styles.module.css +++ b/website/src/components/hero/styles.module.css @@ -29,7 +29,7 @@ .showGraphic{ position: absolute; left: -50px; - top:150px; + top:125px; content:''; display: block; width: 25vw; diff --git a/website/src/components/hubspotForm/index.js b/website/src/components/hubspotForm/index.js new file mode 100644 index 00000000000..01fd8da5f8d --- /dev/null +++ b/website/src/components/hubspotForm/index.js @@ -0,0 +1,41 @@ +import React, { useEffect } from 'react' + +export const HubSpotForm = (props) => { + const { + region, + portalId, + formId, + containerId = 'hubspotForm', + sfdcCampaignId = '', + } = props + const formatContainerId = '#' + containerId + useEffect(() => { + // Only add hubspot script if not already exists + let script = document.getElementById('hubspot-v2-script') + if (!script) { + const newScript = document.createElement('script') + newScript.src = '//js.hsforms.net/forms/v2.js' + newScript.id = 'hubspot-v2-script' + document.body.appendChild(newScript) + script = newScript + } + + script.addEventListener('load', () => { + if (window.hbspt) { + window.hbspt.forms.create({ + region: region, + portalId: portalId, + formId: formId, + target: formatContainerId, + sfdcCampaignId: sfdcCampaignId, + }) + } + }) + }, []) + + return ( +
          +
          +
          + ) +} diff --git a/website/src/components/hubspotForm/styles.module.css b/website/src/components/hubspotForm/styles.module.css new file mode 100644 index 00000000000..6ec8f2e3da6 --- /dev/null +++ b/website/src/components/hubspotForm/styles.module.css @@ -0,0 +1,80 @@ +:global(.hs-form) { + position: relative; + max-width: 976px; + width: 100%; + z-index: 2; + margin: 4rem auto; + background: #fff; + border-radius: 24px; + box-shadow: 0 0 20px 10px rgba(0, 0, 0, 0.05); + padding: 50px; + font: var(--ifm-font-size-base) / var(--ifm-line-height-base) var(--ifm-font-family-base) +} + +@media(max-width: 996px) { + :global(.hs-form) { + padding: 30px 20px; + } +} + +[data-theme='dark'] :global(.hs-form) { + background: #505D73; +} + +:global(.hs-form) label { + margin-bottom: 0.5rem; +} + +:global(.hs-form-required) { + color: #de1c22; +} + +:global(.hs-form-field) { + margin-bottom: 1rem; +} + +:global(.hs-input) { + width: 100%; + height: 2.5rem; + background: rgba(239, 242, 243, 0.5); + border: none; + border-radius: 8px; + padding: 0.375rem 0.75rem; + transition: border-color 0.15s ease-in-out, box-shadow 0.15s ease-in-out; +} + +:global(.hs-input:focus) { + border-color: #93959c; + outline: 0; + box-shadow: 0 0 0 0.25rem rgba(38, 42, 56, 0.25) +} + +[data-theme='dark'] :global(.hs-input) { + color: #212529; +} + +textarea:global(.hs-input) { + height: 7rem; + font-family: var(--ifm-font-family-base); +} + +:global(.hs-recaptcha) { + display: none; +} + +:global(.hs-submit) { + margin-top: 3rem; + text-align: center; +} + +:global(.hs-submit .hs-button) { + color: var(--color-white); +} + +:global(.hs-submit .hs-button:hover) { + background: var(--color-light-teal); +} + +:global(.submitted-message) { + text-align: center; +} diff --git a/website/src/components/lightbox/index.js b/website/src/components/lightbox/index.js index d312e67ca8f..1c748bbb04f 100644 --- a/website/src/components/lightbox/index.js +++ b/website/src/components/lightbox/index.js @@ 
-1,36 +1,50 @@ import React from 'react'; - import styles from './styles.module.css'; +import imageCacheWrapper from '../../../functions/image-cache-wrapper'; + +function Lightbox({ + src, + collapsed = false, + alignment = "center", + alt = undefined, + title = undefined, + width = undefined, +}) { -function Lightbox({children, src, title, collapsed}) { - var collapsedClass = !!collapsed ? styles.collapsed : ''; + // Set alignment class if alignment prop used + let imageAlignment = '' + if(alignment === "left") { + imageAlignment = styles.leftAlignLightbox + } else if(alignment === "right") { + imageAlignment = styles.rightAlignLightbox + } - const featherlightStyles = ` - /** - * Featherlight - ultra slim jQuery lightbox - * Version 1.7.14 - http://noelboss.github.io/featherlight/ - * - * Copyright 2019, Noël Raoul Bossart (http://www.noelboss.com) - * MIT Licensed. - **/ - html.with-featherlight{overflow:hidden}.featherlight{display:none;position:fixed;top:0;right:0;bottom:0;left:0;z-index:2147483647;text-align:center;white-space:nowrap;cursor:pointer;background:#333;background:rgba(0,0,0,0)}.featherlight:last-of-type{background:rgba(0,0,0,.8)}.featherlight:before{content:'';display:inline-block;height:100%;vertical-align:middle}.featherlight .featherlight-content{position:relative;text-align:left;vertical-align:middle;display:inline-block;overflow:auto;padding:25px 25px 0;border-bottom:25px solid transparent;margin-left:5%;margin-right:5%;max-height:95%;background:#fff;cursor:auto;white-space:normal}.featherlight .featherlight-inner{display:block}.featherlight link.featherlight-inner,.featherlight script.featherlight-inner,.featherlight style.featherlight-inner{display:none}.featherlight .featherlight-close-icon{position:absolute;z-index:9999;top:0;right:0;line-height:25px;width:25px;cursor:pointer;text-align:center;font-family:Arial,sans-serif;background:#fff;background:rgba(255,255,255,.3);color:#000;border:0;padding:0}.featherlight 
.featherlight-close-icon::-moz-focus-inner{border:0;padding:0}.featherlight .featherlight-image{width:100%}.featherlight-iframe .featherlight-content{border-bottom:0;padding:0;-webkit-overflow-scrolling:touch}.featherlight iframe{border:0}.featherlight *{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}@media only screen and (max-width:1024px){.featherlight .featherlight-content{margin-left:0;margin-right:0;max-height:98%;padding:10px 10px 0;border-bottom:10px solid transparent}}@media print{html.with-featherlight>*>:not(.featherlight){display:none}} - ` return ( - - -
          - - - {title} - - -

          { title }

          -
          -
          + <> + + + + + {alt + + + {title && ( + { title } + )} + + ); } diff --git a/website/src/components/lightbox/styles.module.css b/website/src/components/lightbox/styles.module.css index 4c33e5173e5..af0bb086cf5 100644 --- a/website/src/components/lightbox/styles.module.css +++ b/website/src/components/lightbox/styles.module.css @@ -1,18 +1,26 @@ - :local(.title) { - text-align: center; - font-size: small; - width: 100%; + text-align: center; + font-size: small; + width: 100%; + display: block; } :local(.docImage) { - filter: drop-shadow(4px 4px 6px #aaaaaa33); - margin: 10px auto; - display: block; - max-width: 400px; + filter: drop-shadow(4px 4px 6px #aaaaaa33); + margin: 10px auto; + padding-right: 10px; + display: block; + max-width: 400px; } :local(.collapsed) { - max-width: 400px !important; - display: inline-block; + display: inline-block; + padding: 0 5px; +} + +.leftAlignLightbox { + margin: 10px 0; +} +.rightAlignLightbox { + margin: 10px 0 10px auto; } diff --git a/website/src/components/lineage/index.js b/website/src/components/lineage/index.js new file mode 100644 index 00000000000..6c22e2bae99 --- /dev/null +++ b/website/src/components/lineage/index.js @@ -0,0 +1,38 @@ +import React, { useEffect, useState } from 'react'; +import { transformLineageNodes } from './utils/transform-lineage-nodes'; + +let Dag = null; + +try { + /** As a private package, not every developer will have access to this repo. */ + // const DagImport = require('@dbt-labs/react-dbt-dag'); + // require('@dbt-labs/react-dbt-dag/dag.css'); + // require('@dbt-labs/react-dbt-dag/dag.standalone.css'); + + // Dag = DagImport.Dag; +} catch (err) { + /** + * react-dbt-dag is a private repo. Not all developers of the + * open source docs will have access to this repo. 
+ */ +} + +export const Lineage = ({ nodes, currentNodeId, onNodeSelect }) => { + const [dagNodes, setDagNodes] = useState([]) + + useEffect(() => { + const transformedNodes = transformLineageNodes(nodes); + setDagNodes(transformedNodes) + }, [nodes]) + + if (!Dag) { + return
          Lineage Not Available!
          + } + + return { + onNodeSelect(targetNode) + }} /> +} diff --git a/website/src/components/lineage/utils/transform-lineage-nodes.js b/website/src/components/lineage/utils/transform-lineage-nodes.js new file mode 100644 index 00000000000..023d5cfbf18 --- /dev/null +++ b/website/src/components/lineage/utils/transform-lineage-nodes.js @@ -0,0 +1,23 @@ +/** + * The manifest represents nodes in one format, + * while the DAG expects nodes in + * [another format](https://github.com/dbt-labs/react-dbt-dag/blob/main/src/dag/InternalDagInterfaces.ts#L34-L48). + */ +export const transformLineageNodes = (nodes) => { + const dagNodes = Object.entries(nodes || {}).map(([nodeId, node]) => { + return { + id: nodeId, + parents: node.depends_on?.nodes || [], + label: node.name, + resourceType: node.resource_type, + data: { + package_name: node.package_name, + resource_type: node.resource_type, + node_name: nodeId, + file_name: node.name, + }, + } + }); + + return dagNodes; +} \ No newline at end of file diff --git a/website/src/components/link/index.js b/website/src/components/link/index.js deleted file mode 100644 index 81bcfd71c6a..00000000000 --- a/website/src/components/link/index.js +++ /dev/null @@ -1,123 +0,0 @@ -import React from 'react'; - -import Link from '@docusaurus/Link'; - -const ENV = process ? process.env : {}; - -const docsFiles = require.context( - '../../../docs/', - true, - /\.md$/ -); - -var slugs = {}; -var sources = {}; -docsFiles.keys().forEach(function(key, i) { - var doc = docsFiles(key); - var meta = doc.metadata; - if (!meta) { - // how? 
- return - } - - var slugParts = meta.id.split("/"); - var slug = slugParts[slugParts.length - 1]; - - if (slugs[slug]) { - var message = `Duplicate slug found: ${slug}\n`; - message += ` - ${meta.source}\n`; - message += ` - ${slugs[slug].source}`; - console.error(message); - } - - slugs[slug] = meta; - sources[meta.source] = meta -}); - -function findSource(source_href) { - var stripped_source_href = source_href.replace(/.md$/, '') - if (!stripped_source_href.startsWith('/')) { - stripped_source_href = '/' + stripped_source_href; - } - var found = null; - for (var source in sources) { - var stripped_source = source.replace(/.md$/, '') - var is_match = stripped_source.endsWith(stripped_source_href); - if (is_match && !found) { - found = sources[source]; - } else if (is_match && found) { - // The link is ambiguous. Pick one, but error? - var msg = ( - `Ambiguous link slug: "${source_href}"\n` - + `- Two matched:, "${found.id}", "${sources[source].id}"` - ); - - console.error(msg); - if (ENV.DOCS_ENV == 'build') { - throw new Error(`Ambiguous link detected: ${msg}`) - } - } - } - return found; -} - -function expandRelativeLink(href, ignoreInvalid) { - if (!href) { - //throw new Error(`Broken link detected (href is undefined)`) - // how does this happen? 
- return {bad: true, link: href}; - } - - var [link, hash] = href.split("#") - - if (!link && hash) { - // relative in-page link, it's fine - return {bad: false, link: href}; - } else if (link && !hash) { - // hash is missing, that's also very ok - hash = '' - } - - var isExternal = !!link.match(/https?:/) || !!link.match(/:/); - - var sourceLink = findSource(link); - if (sourceLink) { - return { - bad: false, - link: `${sourceLink.permalink}#${hash}` - } - } else if (slugs[link]) { - return { - bad: false, - link: `${slugs[link].permalink}#${hash}` - } - } else if (!isExternal && !href.startsWith('/')) { - if (ENV.DOCS_ENV == 'build' && !ignoreInvalid) { - console.log(` - Broken link detected ${href}`); - throw new Error("Broken link") - } else { - return { - bad: true, - link: href - } - } - } - - return {bad: false, link: href}; -} - - -function CheckedLink({children, href, ignoreInvalid, className}) { - var style = {}; - const {bad, link} = expandRelativeLink(href, ignoreInvalid); - if (bad) { - style['borderBottom'] = '2px dashed #ff6961' - } - - return ( - {children} - ); -} - -export default CheckedLink; - diff --git a/website/src/components/loom/index.js b/website/src/components/loom/index.js index b409643fb56..c2cb52592de 100644 --- a/website/src/components/loom/index.js +++ b/website/src/components/loom/index.js @@ -2,7 +2,7 @@ import React from 'react'; import styles from './styles.module.css'; -function LoomVideo({children, id}) { +function LoomVideo({id}) { return (
          diff --git a/website/src/components/postCarousel/index.js b/website/src/components/postCarousel/index.js index 0b93c2d29e5..c3f378376c3 100644 --- a/website/src/components/postCarousel/index.js +++ b/website/src/components/postCarousel/index.js @@ -1,8 +1,8 @@ import React from 'react'; import BlogPostCard from '@site/src/components/blogPostCard'; import { Swiper, SwiperSlide } from 'swiper/react'; -import { Navigation } from 'swiper'; import 'swiper/css'; +import { Navigation } from 'swiper'; import 'swiper/css/navigation'; @@ -10,18 +10,20 @@ function PostCarousel({ blogPostData }) { return (
          diff --git a/website/src/components/quickstartGuideCard/index.js b/website/src/components/quickstartGuideCard/index.js new file mode 100644 index 00000000000..fdc629bd7b0 --- /dev/null +++ b/website/src/components/quickstartGuideCard/index.js @@ -0,0 +1,28 @@ +import React from "react"; +import Link from "@docusaurus/Link"; +import styles from "./styles.module.css"; +import getIconType from "../../utils/get-icon-type"; + +function QuickstartGuideCard({ frontMatter }) { + const { id, title, time_to_complete, icon } = frontMatter; + return ( + + {icon && getIconType(icon, styles.icon)} + +

          {title}

          + + {time_to_complete && ( + {time_to_complete} + )} + + + Start + + + ); +} + +export default QuickstartGuideCard; diff --git a/website/src/components/quickstartGuideCard/styles.module.css b/website/src/components/quickstartGuideCard/styles.module.css new file mode 100644 index 00000000000..8202f694fcd --- /dev/null +++ b/website/src/components/quickstartGuideCard/styles.module.css @@ -0,0 +1,65 @@ +.quickstartCard { + border: 1px solid #EFF2F3; + border-radius: var(--border-radius); + box-shadow: 0px 11px 24px rgba(138, 138, 138, .1); + padding: 2.5rem 2.5rem 1.5rem 2.5rem; + flex: 0 0 30%; + border-bottom: solid 4px var(--color-light-teal); + display: flex; + flex-direction: column; + text-decoration: none !important; + transition: all 0.2s ease-in-out; +} + +.quickstartCard:hover { + border-bottom-color: var(--color-orange); + transform: translateY(-7px); +} + +.quickstartCard .icon { + max-width: 25px; + font-size: 25px; + margin-bottom: .8rem; + color: var(--ifm-menu-color); +} + +[data-theme='dark'] .quickstartCard .icon { + color: #fff; +} + +.quickstartCard h3 { + font-weight: 600; + color:#262A38; + font-size: 1.5rem; +} + +[data-theme='dark'] .quickstartCard h3 { + color: #fff; +} + +.quickstartCard h3:hover { + text-decoration: none; +} + +.quickstartCard .time_to_complete { + color:var(--ifm-menu-color) +} + +.quickstartCard .start { + font-size: 1.125rem; + margin-top: auto; + padding-top: 2rem; +} + +[data-theme='dark'] .quickstartCard .start { + color: #fff; +} + +[data-theme='dark'] .quickstartCard:hover .start { + text-decoration: underline; +} + +.quickstartCard .start:after { + content: " →"; + margin-left: 5px; +} diff --git a/website/src/components/quickstartGuideList/index.js b/website/src/components/quickstartGuideList/index.js new file mode 100644 index 00000000000..954d54e6d47 --- /dev/null +++ b/website/src/components/quickstartGuideList/index.js @@ -0,0 +1,49 @@ +import React from 'react'; +import Head from '@docusaurus/Head'; 
+import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import Layout from '@theme/Layout'; +import Hero from '@site/src/components/hero'; +import QuickstartGuideCard from '../quickstartGuideCard' +import styles from './styles.module.css'; + +const quickstartTitle = 'Quickstarts' +const quickstartDescription = 'dbt Core is a powerful open-source tool for data transformations and dbt Cloud is the fastest and most reliable way to deploy your dbt jobs. With the help of a sample project, learn how to quickly start using dbt and one of the most common data platforms.' + +function QuickstartList({ quickstartData }) { + const { siteConfig } = useDocusaurusContext() + + // Build meta title from quickstartTitle and docusaurus config site title + const metaTitle = `${quickstartTitle}${siteConfig?.title ? ` | ${siteConfig.title}` : ''}` + + return ( + + + {metaTitle} + + + + +
          +
          + {quickstartData && quickstartData.length > 0 ? ( + <> + {quickstartData.map((guide, i) => ( + + ))} + + ) : +

          No quickstarts are available at this time. 😕

          + } +
          +
          +
          + ) +} + +export default QuickstartList diff --git a/website/src/components/quickstartGuideList/styles.module.css b/website/src/components/quickstartGuideList/styles.module.css new file mode 100644 index 00000000000..8c4e45edc8c --- /dev/null +++ b/website/src/components/quickstartGuideList/styles.module.css @@ -0,0 +1,29 @@ +.quickstartHero h1 { + font-size: clamp(var(--ifm-h1-font-size) * 0.9375, 1vw, var(--ifm-h1-font-size)); + --ifm-h1-font-size: 3rem; + +} + +@media (max-width: 576px) { + .quickstartHero h1 { + --ifm-h1-font-size: 2rem; + } +} + +.quickstartHero p { + font-size: 1.25rem; + margin: 0 auto; +} + +.quickstartCardContainer { + display: grid; + grid-template-columns: 1fr 1fr 1fr; + grid-gap: 2rem; + padding: 5rem 1rem; +} + +@media (max-width: 996px) { + .quickstartCardContainer { + grid-template-columns: 1fr; + } +} diff --git a/website/src/components/quickstartTOC/index.js b/website/src/components/quickstartTOC/index.js new file mode 100644 index 00000000000..8c9b8fba910 --- /dev/null +++ b/website/src/components/quickstartTOC/index.js @@ -0,0 +1,209 @@ +// This component is used to build the functionality of the quickstart guides +// Each H2 (##) in the markdown file is a step in the guide + +import React, { useState, useEffect } from "react"; +import clsx from "clsx"; +import style from "./styles.module.css"; +import { useLocation, useHistory } from "@docusaurus/router"; +import queryString from "query-string"; + +function QuickstartTOC() { + const history = useHistory(); + const location = useLocation() + const locationPath = useLocation().pathname; + const queryParams = queryString.parse(location.search); + + const activeStepLocal = typeof localStorage !== "undefined" ? localStorage.getItem(locationPath) : 1; + const activeStepParam = queryParams.step ? (queryParams.step) : activeStepLocal ? 
activeStepLocal : 1; + + const [mounted, setMounted] = useState(false); + const [tocData, setTocData] = useState([]); + const [activeStep, setActiveStep] = useState(activeStepLocal || 1); + + useEffect(() => { + // Get all h2 for each step in the guide + const quickstartContainer = document.querySelector(".quickstart-container"); + const steps = quickstartContainer.querySelectorAll("h2"); + const snippetContainer = document.querySelectorAll(".snippet"); + + // Create an array of objects with the id and title of each step + const data = Array.from(steps).map((step, index) => ({ + id: step.id, + title: step.innerText, + stepNumber: index + 1, + })); + + setTocData(data); + setMounted(true); + setActiveStep(activeStepParam || parseInt(activeStepLocal) || 1); + + // Wrap all h2 (steps), along with all of their direct siblings, in a div until the next h2 + if (mounted) { + // Add snippet container to its parent step + snippetContainer.forEach((snippet) => { + const parent = snippet?.parentNode; + while (snippet?.firstChild && parent.className) { + if (parent) { + parent.insertBefore(snippet.firstChild, snippet); + } + } + }); + + steps.forEach((step, index) => { + const wrapper = document.createElement("div"); + wrapper.classList.add(style.stepWrapper); + wrapper.classList.add(style.hidden); + + // Move the step and all its siblings into the its own div + step.parentNode.insertBefore(wrapper, step); + let currentElement = step; + do { + const nextElement = currentElement.nextElementSibling; + wrapper.appendChild(currentElement); + currentElement = nextElement; + wrapper.setAttribute("data-step", index + 1); + } while (currentElement && currentElement.tagName !== "H2"); + }); + + // Find the active step and show it + const activeStepWrapper = document.querySelector( + `.${style.stepWrapper}[data-step="${activeStep}"]` + ); + activeStepWrapper?.classList.remove(style.hidden); + } + + // Add Next/Prev buttons to step-wrapper divs + if (mounted) { + const stepWrappers = 
document.querySelectorAll(`.${style.stepWrapper}`); + + stepWrappers.forEach((stepWrapper, index) => { + const buttonContainer = document.createElement("div"); + buttonContainer.classList.add(style.buttonContainer); + const prevButton = document.createElement("a"); + const nextButton = document.createElement("a"); + + prevButton.textContent = "Back"; + prevButton.classList.add(clsx(style.button, style.prevButton)); + prevButton.disabled = index === 0; + prevButton.addEventListener("click", () => handlePrev(index + 1)); + + nextButton.textContent = "Next"; + nextButton.classList.add(clsx(style.button, style.nextButton)); + nextButton.disabled = index === stepWrappers.length - 1; + nextButton.addEventListener("click", () => handleNext(index + 1)); + + buttonContainer.appendChild(prevButton); + buttonContainer.appendChild(nextButton); + + stepWrapper.appendChild(buttonContainer); + + // Hide the respective buttons on the first and last steps + if (index === 0) { + prevButton.classList.add(style.hidden); + } + if (index === stepWrappers.length - 1) { + nextButton.classList.add(style.hidden); + } + }); + + const quickstartTitle = document.querySelector("header h1"); + quickstartTitle.classList.add(style.quickstartTitle); + } + }, [mounted]); + + useEffect(() => { + const tocItems = document.querySelectorAll(`.${style.tocItem}`); + + tocItems.forEach((item, i) => { + const isActive = i <= activeStep - 1; + + item.classList.toggle(clsx(style.active), isActive); + }); + + // Scroll to the top of the page when the user clicks next + if (window.scrollY > 0) { + window.scrollTo(0, 0); + } + + // Set local storage to the active step + localStorage.setItem(locationPath, activeStep); + + // If on mobile, auto scroll to the active step in the TOC when activeStep updates + const tocList = document.querySelector(`.${style.tocList}`); + const activeItems = document.querySelectorAll(`.${style.active}`); + + // Add query param for the active step + history.replace({ + search: 
queryString.stringify({ + ...queryParams, + step: activeStep, + }), + }); + + if (window.innerWidth < 996) { + const activeItem = activeItems[activeItems.length - 1]; + + if (activeItem) { + const itemTop = activeItem.offsetTop; + const itemHeight = activeItem.offsetHeight; + const containerTop = tocList.scrollTop; + const containerHeight = tocList.offsetHeight; + + if (itemTop < containerTop || itemTop + itemHeight > containerTop + containerHeight) { + tocList.scrollTop = itemTop; + } + } + } + }, [activeStep, mounted]); + + // Handle updating the active step + const updateStep = (currentStepIndex, newStepIndex) => { + const currentStep = document.querySelector( + `.${style.stepWrapper}[data-step='${currentStepIndex}']` + ); + const newStep = document.querySelector( + `.${style.stepWrapper}[data-step='${newStepIndex}']` + ); + + currentStep?.classList.add(style.hidden); + newStep?.classList.remove(style.hidden); + + setActiveStep(newStepIndex); + }; + + const handleNext = (currentStepIndex) => { + if (currentStepIndex <= tocData.length - 1) { + updateStep(currentStepIndex, currentStepIndex + 1); + } + }; + + const handlePrev = (currentStepIndex) => { + if (currentStepIndex > 0) { + updateStep(currentStepIndex, currentStepIndex - 1); + } + }; + + // Handle TOC click + const handleTocClick = (e) => { + const stepNumber = e.target.closest("li").dataset.step; + + updateStep(activeStep, stepNumber); + }; + + return ( +
            + {tocData.map((step) => ( +
          • + {step.stepNumber} {step.title} +
          • + ))} +
          + ); +} + +export default QuickstartTOC; diff --git a/website/src/components/quickstartTOC/styles.module.css b/website/src/components/quickstartTOC/styles.module.css new file mode 100644 index 00000000000..edfd0380098 --- /dev/null +++ b/website/src/components/quickstartTOC/styles.module.css @@ -0,0 +1,116 @@ +.quickstartTitle { + padding: 1rem 0 2rem; +} + +.tocList { + list-style: none; + padding: 0; + margin: 0; + width: 370px; + flex-shrink: 0; + padding-right: 3rem; +} + +.tocList li { + padding: 1rem; + display: block; + border: 1px solid #EFF2F3; + box-shadow: 0px 11px 24px rgba(138, 138, 138, 0.1), 0px 0px 0px rgba(138, 138, 138, 0.1); + border-radius: 10px; + margin-bottom: 1rem; + display: grid; + grid-template-columns: 1fr 5fr; + cursor: pointer; + color: var(--color-primary-blue); + text-decoration: none; +} + +.tocList li span { + border-radius: 50%; + display: inline-block; + width: 30px; + height: 30px; + text-align: center; + line-height: 27px; + color: var(--color-light-teal); + border: solid 1px var(--color-light-teal); + margin-bottom: auto; +} + +.tocList .active span { + background: var(--color-light-teal); + color: var(--color-white); +} + +html[data-theme="dark"] .tocList li { + color: var(--color-white); +} + +html[data-theme="dark"] .tocList li span { + border-color: var(--color-white); + color: var(--color-white); +} + +html[data-theme="dark"] .tocList .active span { + border-color: var(--color-light-teal); +} + +.tocItem { + display: flex; + align-items: center; + justify-content: space-between; +} + +.buttonContainer { + display: flex; + justify-content: space-between; + margin: 2rem 0; +} + +.buttonContainer a { + padding: .5rem; + cursor: pointer; + transition-property: color, background, border-color; + transition-duration: var(--ifm-button-transition-duration); + transition-timing-function: var(--ifm-transition-timing-default); + border: 2px solid var(--color-light-teal); + border-radius: 5px; + width: 125px; + text-align: 
center; +} + +.buttonContainer a:hover { + background: var(--color-light-teal); + color: var(--color-white) +} + +.buttonContainer .prevButton { + margin-right: auto; +} + +.buttonContainer .nextButton { + margin-left: auto; +} + +.stepWrapper[data-step="1"] .nextButton { + background: var(--color-light-teal); + color: var(--color-white) +} + +.stepWrapper.hidden { + display: none; +} + +.hidden { + display: none; +} + +@media (max-width: 996px) { + .tocList { + width: 100%; + padding-right: 0; + margin-bottom: 2rem; + height: 160px; + overflow-y: auto; + } +} diff --git a/website/src/components/snippet/index.js b/website/src/components/snippet/index.js index d16381ac913..daf674c3d14 100644 --- a/website/src/components/snippet/index.js +++ b/website/src/components/snippet/index.js @@ -1,16 +1,17 @@ import React from 'react' import styles from './styles.module.css'; +import clsx from 'clsx'; {/* The Snippet component works in a similar way as FAQs. Pass the filename of a snippet within the snippets directory as a prop to use throughout the docs. */} -export default function Snippet({ src }) { - const file = require('../../../snippets/' + src + '.md') +export default function Snippet({ path }) { + const file = require('../../../snippets/' + path + '.md') const contents = file.default({}); return ( -
          +
          { contents }
          ) diff --git a/website/src/components/stoplight/index.js b/website/src/components/stoplight/index.js new file mode 100644 index 00000000000..bff43dd27c8 --- /dev/null +++ b/website/src/components/stoplight/index.js @@ -0,0 +1,27 @@ +import { API } from "@stoplight/elements"; +import React from "react"; +import useBaseUrl from "@docusaurus/useBaseUrl"; +export default function Stoplight({ version }) { + if (!["v1", "v2", "v3", "private"].includes(version)) { + return null; + } + return ( + <> + + + + ); +} diff --git a/website/src/components/term/index.js b/website/src/components/term/index.js index c233647dc73..760acb736cb 100644 --- a/website/src/components/term/index.js +++ b/website/src/components/term/index.js @@ -11,7 +11,7 @@ import styles from './styles.module.css'; */} export default function Term({ id, children = undefined }) { - const [uniqueID, setUniqueID] = useState(String(Math.random())) + const [uniqueID] = useState(String(Math.random())) const [pageReady, setPageReady] = useState(false) // Rebuild tooltips on every update diff --git a/website/src/components/versionBlock/versionBlock.cy.js b/website/src/components/versionBlock/versionBlock.cy.js new file mode 100644 index 00000000000..b8411fedd7d --- /dev/null +++ b/website/src/components/versionBlock/versionBlock.cy.js @@ -0,0 +1,31 @@ +import React from 'react' +import VersionBlock from './index.js' +import VersionContext from '../../stores/VersionContext.js' + +describe('Test VersionBlock component', () => { + it('Should render same text passed into title prop', () => { + cy.mount(Versioned content) + cy.contains('Versioned content').should('be.visible') + }) + it('Should render content if current version >= firstVersion', () => { + cy.mount( + + + Current version + + + ) + cy.contains('Current version').should('exist') + }) + + it('Should not render content if current version less than firstVersion', () => { + cy.mount( + + + Current version + + + ) + cy.contains('Current 
version').should('not.exist') + }) +}) \ No newline at end of file diff --git a/website/src/components/wistia/index.js b/website/src/components/wistia/index.js index 1230b119f7f..23bd3f7e774 100644 --- a/website/src/components/wistia/index.js +++ b/website/src/components/wistia/index.js @@ -23,7 +23,7 @@ function WistiaVideo({id, paddingTweak = "56.25%"}) {
          - + ); } diff --git a/website/src/components/youtube/index.js b/website/src/components/youtube/index.js index ddf18a057d3..035b872afdb 100644 --- a/website/src/components/youtube/index.js +++ b/website/src/components/youtube/index.js @@ -1,6 +1,6 @@ import React from 'react'; -function YoutubeVideo({children, id}) { +function YoutubeVideo({id}) { return ( ); diff --git a/website/src/css/custom.css b/website/src/css/custom.css index d9eddd9de3b..fc51ef8a8ef 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -58,7 +58,7 @@ --pagination-icon-prev: "\2190"; --filter-brightness-low: 1.1; --filter-brightness-high: 1.5; - + --darkmode-link-color: #1FA4A3; --light-dark-toggle: ""; /* search overrides */ @@ -99,10 +99,17 @@ html[data-theme="dark"] { /* admonition icon */ --ifm-color-info-contrast-foreground: var(--ifm-color-gray-900); - --ifm-table-cell-color: var(--color-green-blue); } +/* Linked `code` tags visibility adjustment */ +html[data-theme=dark] a code { + color: var(--darkmode-link-color); +} +html[data-theme=dark] a code:hover { + color: var(--darkmode-link-color); +} + /* For /dbt-cloud/api REDOC Page */ html[data-theme="dark"] .api-content h2, html[data-theme="dark"] .api-content h3, @@ -115,11 +122,11 @@ html[data-theme="dark"] .api-content h1 { html[data-theme="dark"] .api-content button, html[data-theme="dark"] .api-content a { - filter: brightness(1.25); + filter: brightness(var(--filter-brightness-low)); } html[data-theme="dark"] .api-content a:hover { - filter: brightness(1.25); + filter: brightness(var(--filter-brightness-low)); } .redoc-wrap .api-content a, @@ -144,17 +151,33 @@ html[data-theme="dark"] .api-content { color: white; } -html[data-theme="dark"] .api-content table td > span, +html[data-theme="dark"] .api-content table td>span, html[data-theme="dark"] .api-content table td p { color: var(--color-orange); } + +html[data-theme="dark"] details table td { + color: inherit; +} + table td { vertical-align: top; 
} +html[data-theme=dark] main .row .col:first-of-type a:not(.button) { + color: var(--darkmode-link-color); +} + +html[data-theme="dark"] main .row .col:first-of-type a:hover { + filter: brightness(var(--filter-brightness-low)); +} + +html[data-theme="dark"] main .row .col:first-of-type a article * { + color: white; +} + html[data-theme="dark"] table td { - filter: brightness(1.25); color: white; } @@ -224,11 +247,7 @@ code { color: var(--ifm-color-emphasis-900); } -html[data-theme="dark"] a code { - color: var(--color-white); -} - -.main-wrapper .home .col > p { +.main-wrapper .home .col>p { font-size: 1.25rem; } @@ -322,6 +341,16 @@ a.code-link:hover { border: none; } +.navbarHiddenOnLoad { + transform: translate3d(0, calc(-100% - 2px), 0) +} + +@media(max-width: 996px) { + .navbar-sidebar .menu__link.nav-create-account { + width: fit-content; + } +} + .navbar-sidebar .menu__link.nav-create-account:hover { color: var(--color-white); background: var(--color-light-teal); @@ -428,7 +457,8 @@ a.navbar__item.navbar__link.btn:hover { background: var(--ifm-menu-color-background-hover) !important; } -.menu__caret, .menu__link { +.menu__caret, +.menu__link { display: flex; align-items: center; } @@ -486,14 +516,11 @@ i.theme-doc-sidebar-item-category.theme-doc-sidebar-item-category-level-2.menu__ } /* color for subtext only */ -.theme-doc-sidebar-item-link - :not(.menu__list-item-collapsible .menu__link:first-of-type) { +.theme-doc-sidebar-item-link :not(.menu__list-item-collapsible .menu__link:first-of-type) { color: var(--color-nav-sub-level-text); } -[data-theme="dark"] - .theme-doc-sidebar-item-link - :not(.menu__list-item-collapsible .menu__link:first-of-type) { +[data-theme="dark"] .theme-doc-sidebar-item-link :not(.menu__list-item-collapsible .menu__link:first-of-type) { color: var(--color-white); } @@ -504,7 +531,6 @@ i.theme-doc-sidebar-item-category.theme-doc-sidebar-item-category-level-2.menu__ /* set < icon to right side */ .menu__list-item-collapsible { 
flex-direction: row; - background-color: transparent; } .menu__list-item-collapsible button { @@ -524,6 +550,7 @@ i.theme-doc-sidebar-item-category.theme-doc-sidebar-item-category-level-2.menu__ .docs-doc-page .theme-doc-toc-mobile .clean-btn:after { transform: rotate(90deg); } + .docs-doc-page .theme-doc-toc-mobile[class*="Expanded"] .clean-btn:after { transform: rotate(180deg); } @@ -537,10 +564,7 @@ i.theme-doc-sidebar-item-category.theme-doc-sidebar-item-category-level-2.menu__ background-repeat: no-repeat; height: 9px; min-width: 6px; - width: calc( - var(--ifm-breadcrumb-separator-size) * var(--ifm-breadcrumb-size-multiplier) * - var(--ifm-breadcrumb-separator-size-multiplier) - ); + width: calc(var(--ifm-breadcrumb-separator-size) * var(--ifm-breadcrumb-size-multiplier) * var(--ifm-breadcrumb-separator-size-multiplier)); } /* Table of Contents */ @@ -612,18 +636,22 @@ i.theme-doc-sidebar-item-category.theme-doc-sidebar-item-category-level-2.menu__ /* border: 1px solid transparent; */ border-radius: 5px; } + .alert h4 { margin-top: 0; color: inherit; } + .alert .alert-link { font-weight: inherit; } -.alert > p, -.alert > ul { + +.alert>p, +.alert>ul { margin-bottom: 0; } -.alert > p + p { + +.alert>p+p { margin-top: 5px; } @@ -632,28 +660,49 @@ i.theme-doc-sidebar-item-category.theme-doc-sidebar-item-category-level-2.menu__ --ifm-alert-border-color: var(--ifm-color-info); color: var(--ifm-color-gray-900); } + .alert--success { --ifm-alert-background-color: var(--ifm-color-success-light); --ifm-alert-border-color: var(--ifm-color-success); color: var(--ifm-color-gray-900); } + .alert--danger { --ifm-alert-background-color: var(--ifm-color-danger-light); --ifm-alert-border-color: var(--ifm-color-danger); color: var(--ifm-color-gray-900); } + .alert--warning { --ifm-alert-background-color: var(--ifm-color-warning-light); --ifm-alert-border-color: var(--ifm-color-warning); color: var(--ifm-color-gray-900); } +.alert--secondary, +.alert--secondary a, 
+.alert--secondary svg { + --ifm-alert-background-color: #474748; + color: white !important; + fill: white !important; +} + +html[data-theme="dark"] .alert * { + --ifm-alert-foreground-color: var(--ifm-color-gray-900); +} + +html[data-theme="dark"] .alert table { + background-color: black; + color: white; + border-radius: var(--ifm-code-border-radius); +} + /* for dark mode */ .alert--info a, .alert--success a, .alert--danger a, .alert--warning a { - color: var(--ifm-color-gray-900); + color: var(--ifm-color-gray-900) !important; } .linkout { @@ -695,7 +744,7 @@ i.theme-doc-sidebar-item-category.theme-doc-sidebar-item-category-level-2.menu__ } } -.banner-animation svg #lines > * { +.banner-animation svg #lines>* { stroke-dasharray: 4px 6px; stroke-dashoffset: 60; @@ -812,6 +861,14 @@ div .toggle_src-components-faqs-styles-module { gap: 1em; } +html[data-theme="dark"] .pagination-nav a { + color: var(--darkmode-link-color); +} + +html[data-theme="dark"] .pagination-nav a:hover { + filter: brightness(var(--filter-brightness-low)); +} + .pagination-nav__link { padding: 1rem 0; transition: 100ms all ease-in-out; @@ -837,7 +894,7 @@ div .toggle_src-components-faqs-styles-module { /* Font Sizing Adjustments */ .markdown, -.markdown > p, +.markdown>p, .markdown li, blockquote, th, @@ -847,20 +904,26 @@ td, font-size: 1.125rem; line-height: 1.5; } + +.faqs>div { + margin: 1em 0; +} + .pagination-nav__link { font-size: 1rem; } + .title_src-components-file- { font-size: 1.125rem !important; } + .markdown h1:first-child { - font-size: clamp( - var(--ifm-h1-font-size) * 0.9375, - 1vw, - var(--ifm-h1-font-size) - ); + font-size: clamp(var(--ifm-h1-font-size) * 0.9375, + 1vw, + var(--ifm-h1-font-size)); margin-top: 0.5rem; } + .link_src-components-faqs-styles-module { font-size: 1.125rem; } @@ -873,14 +936,13 @@ td, max-width: 100%; } -.generatedIndexPage_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocCategoryGeneratedIndexPage-styles-module - 
.cardContainer_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocCard-styles-module, +.generatedIndexPage_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocCategoryGeneratedIndexPage-styles-module .cardContainer_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocCard-styles-module, .generatedIndexPage_aEAk .card { color: var(--color-white); background: var(--ifm-footer-background-color); } -.generatedIndexPage_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocCategoryGeneratedIndexPage-styles-module - .cardContainer_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocCard-styles-module:hover, + +.generatedIndexPage_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocCategoryGeneratedIndexPage-styles-module .cardContainer_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocCard-styles-module:hover, .generatedIndexPage_aEAk .card:hover { color: var(--color-white); background: var(--color-primary-blue); @@ -913,8 +975,13 @@ html[data-theme="dark"] .blog-breadcrumbs a[href="#"] { filter: brightness(var(--filter-brightness-low)); } -html[data-theme="dark"] .blog-breadcrumbs a:not(:last-of-type):after { - color: var(--ifm-link-color); +html[data-theme="dark"] .blog-breadcrumbs a:hover { + filter: brightness(var(--filter-brightness-low)); +} + +html[data-theme="dark"] .blog-breadcrumbs a:not(:last-of-type):after, +html[data-theme="dark"] .blog-breadcrumbs a { + color: var(--darkmode-link-color); } html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { @@ -953,6 +1020,26 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { line-height: 1.25; } +/* BlogPostItem header h2 size */ +article[itemprop="blogPost"] h2 { + font-size: 2rem; +} + +html[data-theme="dark"] article[itemprop="blogPost"] a { + color: var(--darkmode-link-color); +} + +html[data-theme="dark"] article[itemprop="blogPost"] a:hover { + filter: brightness(var(--filter-brightness-low)); +} + +/* Sidebar Nav */ 
+html[data-theme="dark"] .main-wrapper nav a:hover, +html[data-theme="dark"] .main-wrapper nav a:active { + color: var(--darkmode-link-color) !important; + filter: brightness(var(--filter-brightness-low)); +} + /* footer styles */ .footer { font-weight: var(--ifm-font-weight-narrow); @@ -1013,7 +1100,7 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { /* copyright */ .footer__bottom { text-align: left; - color: var(--color-footer-accent); + color: white; font-size: 0.875rem; } @@ -1028,11 +1115,12 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { } @media (min-width: 997px) and (max-width: 2560px) { - .container > .row .col { + .container>.row>.col { padding-left: 2.5rem; } + /* for non row divs like on /category/adapter-development */ - .container > div:not(.row) { + .container>div:not(.row) { padding-left: 2.5rem; } } @@ -1056,13 +1144,12 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { } @media (max-width: 996px) { - .navbar-sidebar__brand - .navbarSidebarToggle_node_modules-\@docusaurus-theme-classic-lib-next-theme-Navbar-styles-module { + .navbar-sidebar__brand .navbarSidebarToggle_node_modules-\@docusaurus-theme-classic-lib-next-theme-Navbar-styles-module { margin-right: 0; margin-left: 0.25em; } - .navbar-sidebar__brand > div:first-of-type { + .navbar-sidebar__brand>div:first-of-type { order: 1; margin-right: 0; } @@ -1104,6 +1191,7 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { } @media screen and (min-width: 1200px) { + /* search bar styles */ button.DocSearch-Button { border: 1px solid #95a0b1; @@ -1162,7 +1250,7 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { max-width: inherit; } - html.docs-version-current .navbar__brand { + .dbt__brand { background-color: var(--ifm-background-color); border-right: 1px solid var(--ifm-toc-border-color); } @@ -1173,7 +1261,7 @@ html[data-theme="dark"] .breadcrumbs__item--active 
.breadcrumbs__link { } @media (min-width: 997px) { - html.docs-version-current .navbar__brand { + .dbt__brand { padding: 0 var(--ifm-navbar-padding-horizontal) 0 0; min-height: var(--ifm-navbar-height); max-height: var(--ifm-navbar-height); @@ -1190,10 +1278,7 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { padding: 0 var(--ifm-navbar-padding-horizontal) 0 0; } - html.docs-version-current - aside - html.docs-version-current - .sidebar_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocSidebar-styles-module { + html.docs-version-current aside html.docs-version-current .sidebar_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocSidebar-styles-module { position: relative; } @@ -1211,12 +1296,11 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { align-self: stretch; } - .navbar__items--right > :last-child { - padding: var(--ifm-navbar-item-padding-vertical) 1em - var(--ifm-navbar-item-padding-vertical) 0.5em; + .navbar__items--right> :last-child { + padding: var(--ifm-navbar-item-padding-vertical) 1em var(--ifm-navbar-item-padding-vertical) 0.5em; } - .blog-main-row > main { + .blog-main-row>main { width: 100%; flex-grow: 1; max-width: calc(100% - var(--doc-sidebar-width)); @@ -1252,8 +1336,7 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { margin-bottom: 3rem; } -.blog-post-page - .docItemCol_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocItem-styles-module { +.blog-post-page .docItemCol_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocItem-styles-module { max-width: 70% !important; } @@ -1261,15 +1344,17 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { .blog-post-page .container { margin-top: 0 !important; } + .blog-main-row .blog-aside { margin: 0 !important; display: none; } + .blog-main-row .blog-right-sidebar { max-width: initial !important; } - .blog-post-page - 
.docItemCol_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocItem-styles-module { + + .blog-post-page .docItemCol_node_modules-\@docusaurus-theme-classic-lib-next-theme-DocItem-styles-module { max-width: 100% !important; } } @@ -1289,24 +1374,29 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { justify-content: space-between; flex-flow: row wrap; } + .blog-list-page .blog-index-posts-flex article { flex: 0 0 100%; max-width: 100%; margin: 1rem 0 !important; } + .blog-list-page .blog-index-posts-flex article h2 { font-size: 2rem; line-height: 37.5px; } + .blog-list-page .blog-index-posts-flex article footer.row .col--9 { flex: calc(8 / 12 * 100%); max-width: calc(8 / 12 * 100%); } + .blog-list-page .blog-index-posts-flex article footer.row .col--3 { flex: calc(4 / 12 * 100%); max-width: calc(4 / 12 * 100%); } -.blogPostTitle_src-theme-BlogPostItem-styles-module > img { + +.blogPostTitle_src-theme-BlogPostItem-styles-module>img { display: none; } @@ -1323,6 +1413,7 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { background-size: cover; background-position: center; } + @media (min-width: 997px) { .blog-hero { height: 500px; @@ -1332,29 +1423,37 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { .card.large.blog-hero-card:before { right: 350px; } + .card.large.blog-hero-card:after { right: 250px; } + .blog-hero-card { background: var(--ifm-footer-background-color); color: var(--color-white); } + .blog-hero-card .button { text-decoration: none; } + .blog-hero-card-content { z-index: 2; max-width: 75%; text-align: left; } + @media (max-width: 699px) { + .card.large.blog-hero-card:before, .card.large.blog-hero-card:after { display: none; } + .blog-hero-card { background: var(--color-primary-blue); } + .blog-hero-card-content { max-width: 100%; } @@ -1369,66 +1468,81 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { .author-header { align-items: center; } + 
.author-header-left, .author-header-right { padding: 20px; } + .author-header img { max-width: 100px; border-radius: 50%; display: block; margin: 0 auto; } + .author-header h1 { display: block; margin-bottom: 5px; } + .author-header .author-title { display: flex; align-items: center; margin-bottom: 10px; } + .author-header .author-links { display: flex; align-items: center; justify-content: center; } + .author-header .author-links span { padding-left: 10px; } + .author-header .author-links a { display: block; padding: 5px; } + .author-header .author-links a:first-of-type { padding-left: 10px; } + .author-header .author-links i { font-size: 1.1rem; } + .author-posts-section { margin-top: 2rem; } + .author-posts .author-post { flex: 0 0 100%; max-width: 100%; padding: 0 1rem 1rem; } + .author-posts .author-post img { max-width: 100%; width: 100%; display: block; margin: 0 auto; } + .author-posts .author-post h3 { margin: 10px 0; } + @media screen and (min-width: 700px) { .author-posts .author-post { flex: 0 0 50%; max-width: 50%; } } + @media (min-width: 997px) { .author-posts .author-post { flex: 0 0 33%; @@ -1453,13 +1567,16 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { bottom: 0; max-width: 400px; } + .docs-cta h4 { margin-bottom: 10px; } + .docs-cta p { font-size: 0.9rem; line-height: 21px; } + .docs-cta .docs-cta-btn { background: var(--ifm-color-primary); color: var(--color-white); @@ -1470,10 +1587,16 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { transition: 100ms all ease-in-out; white-space: nowrap; } + .docs-cta .docs-cta-btn:hover { background: var(--ifm-menu-color-active); } +/* CTA dark-mode styles */ +[data-theme='dark'] .docs-cta { + background: var(--ifm-navbar-background-color); +} + @media (max-width: 996px) { .docs-cta { display: none; @@ -1483,6 +1606,7 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { .mobile-toc-section { display: none; } + .mobile-toc-section 
.mobile-toc-container { padding: 0 var(--ifm-spacing-horizontal); } @@ -1491,28 +1615,30 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { .mobile-toc-section { display: block; } - .mobile-toc-section - .tableOfContents_node_modules-\@docusaurus-theme-classic-lib-next-theme-TOC-styles-module { + + .mobile-toc-section .tableOfContents_node_modules-\@docusaurus-theme-classic-lib-next-theme-TOC-styles-module { display: none; } - .mobile-toc-section - .tableOfContents_node_modules-\@docusaurus-theme-classic-lib-next-theme-TOC-styles-module.tocActive { + + .mobile-toc-section .tableOfContents_node_modules-\@docusaurus-theme-classic-lib-next-theme-TOC-styles-module.tocActive { display: block; } - .mobile-toc-section - .tableOfContents_node_modules-\@docusaurus-theme-classic-lib-next-theme-TOC-styles-module - .table-of-contents__left-border { + + .mobile-toc-section .tableOfContents_node_modules-\@docusaurus-theme-classic-lib-next-theme-TOC-styles-module .table-of-contents__left-border { border-left: none; } + #mobile-toc-dropdown button.tocActive:after { transform: rotate(0deg); } + #mobile-toc-dropdown { background-color: var(--ifm-menu-color-background-active); border-radius: var(--ifm-global-radius); margin: 1rem 0; } - #mobile-toc-dropdown > button { + + #mobile-toc-dropdown>button { font-size: inherit; display: flex; justify-content: space-between; @@ -1520,7 +1646,8 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { padding: 0.4rem 0.8rem; width: 100%; } - #mobile-toc-dropdown > button::after { + + #mobile-toc-dropdown>button::after { content: ""; background: var(--ifm-menu-link-sublist-icon) 50% 50% / 2rem 2rem no-repeat; filter: var(--ifm-menu-link-sublist-icon-filter); @@ -1529,18 +1656,21 @@ html[data-theme="dark"] .breadcrumbs__item--active .breadcrumbs__link { transform: rotate(180deg); transition: transform var(--ifm-transition-fast); } + #mobile-toc-dropdown button.tocActive::after { transform: rotate(0deg); 
} - #mobile-toc-dropdown > div.tocActive { + + #mobile-toc-dropdown>div.tocActive { display: block; } - #mobile-toc-dropdown > div > ul { + + #mobile-toc-dropdown>div>ul { border-left: none; } } -.docs-wrapper.docs-doc-page main div.row > article.col--6 { +.docs-wrapper.docs-doc-page main div.row>article.col--6 { margin-bottom: 1.5rem; } @@ -1559,6 +1689,7 @@ footer #ot-sdk-btn.optanon-show-settings { font-weight: 400; cursor: pointer; } + footer #ot-sdk-btn:hover, footer #ot-sdk-btn.optanon-show-settings:hover { background: none; @@ -1578,12 +1709,20 @@ section.discourse-forum-page { .home section { margin: 6.25rem auto 6.25rem; max-width: var(--ifm-container-width-xl); - padding: 0 var(--ifm-spacing-horizontal); + padding: 0 50px; } .home section.from-the-blog { margin-bottom: 0; } + +@media (max-width: 995px) { + .home section.from-the-blog { + padding-left: 0; + padding-right: 0; + } +} + .home section.from-the-community { margin-top: 0; @@ -1594,13 +1733,16 @@ section.discourse-forum-page { } @media (max-width: 995px) { - .home .resource-section h2 { - text-align: center; + .home .resource-section h2 { margin: 2rem inherit; } + + .home h2 { + text-align: center; + } } -section > h2:not(.resource-section) { +section>h2:not(.resource-section) { margin-bottom: 25px; } @@ -1615,12 +1757,26 @@ section > h2:not(.resource-section) { "popResources"; } +.home .resource-section.has-spotlight-member { + grid-template-areas: + "popH2" + "popResources" + "featH2" + "featResource"; +} + @media (min-width: 996px) { .home .resource-section { display: grid; grid-template-columns: 2fr 1fr; row-gap: 25px; - column-gap: calc(var(--ifm-spacing-horizontal) * 4); + column-gap: calc(var(--ifm-spacing-horizontal) * 2); + grid-template-areas: + "popH2 featH2" + "popResources featResource"; + } + + .home .resource-section.has-spotlight-member { grid-template-areas: "popH2 featH2" "popResources featResource"; @@ -1665,11 +1821,13 @@ section > h2:not(.resource-section) { height: 100%; } 
-.home .swiper .swiper-slide { +.home .swiper .swiper-slide, +.docswiper .swiper .swiper-slide { height: unset; } -.home .swiper.swiper-initialized { +.home .swiper.swiper-initialized, +.docswiper .swiper.swiper-initialized { padding-right: 3rem; padding-left: 3rem; padding-top: 0rem; @@ -1677,18 +1835,55 @@ section > h2:not(.resource-section) { margin-bottom: 0; } -.home .swiper .swiper-button-next, .home .swiper .swiper-button-prev { +.docswiper .swiper.swiper-initialized { + padding: 2rem 3rem; +} + +.home .swiper .swiper-button-next, +.home .swiper .swiper-button-prev { color: #047377; font-weight: 800; position: absolute; top: 40%; } -[data-theme='dark'] .home .swiper-button-next, [data-theme='dark'] .home .swiper-button-prev { +.docswiper .swiper .swiper-button-next, +.docswiper .swiper .swiper-button-prev { + color: #047377; + font-weight: 800; + position: absolute; + top: 0; + bottom: 0; + margin: auto 0; +} + + +[data-theme='dark'] .home .swiper-button-next, +[data-theme='dark'] .home .swiper-button-prev, +[data-theme='dark'] .docswiper .swiper-button-next, +[data-theme='dark'] .docswiper .swiper-button-prev { color: #fff; font-weight: 800; } +/* General Swiper Styles */ +.docswiper .swiper-pagination-bullet { + height: 10px; + width: 10px; +} + +.docswiper .swiper-pagination-bullet.swiper-pagination-bullet-active { + background: var(--ifm-color-info); +} + +[data-theme='dark'] .docswiper .swiper-pagination-bullet { + background: var(--color-off-white); +} + +[data-theme='dark'] .docswiper .swiper-pagination-bullet.swiper-pagination-bullet-active { + background: var(--color-light-teal); +} + /* Community Home styles */ .community-home section { margin: calc(5vh) auto calc(2vh); @@ -1696,8 +1891,53 @@ section > h2:not(.resource-section) { padding: 0 var(--ifm-spacing-horizontal); } +/* Report a Problem page styles */ +.report-a-problem { + margin: 0 auto; + max-width: var(--ifm-container-width-xl); + padding: 0 var(--ifm-spacing-horizontal); +} + /* 
utilities */ +.grid--5-col { + display: grid; + grid-template-columns: repeat(1, 1fr); + grid-gap: var(--ifm-spacing-horizontal) +} + +@media(min-width: 996px) { + .grid--5-col { + grid-template-columns: repeat(2, 1fr); + grid-gap: calc(var(--ifm-spacing-horizontal) * 2); + } + @media(min-width: 1500px) { + .grid--5-col { + grid-template-columns: repeat(5, 1fr); + grid-gap: calc(var(--ifm-spacing-horizontal) * 2); + } + } +} + +.grid--4-col { + display: grid; + grid-template-columns: repeat(1, 1fr); + grid-gap: var(--ifm-spacing-horizontal) +} + +@media(min-width: 996px) { + .grid--4-col { + grid-template-columns: repeat(2, 1fr); + grid-gap: calc(var(--ifm-spacing-horizontal) * 2); + } + @media(min-width: 1500px) { + .grid--4-col { + grid-template-columns: repeat(4, 1fr); + grid-gap: calc(var(--ifm-spacing-horizontal) * 2); + } + } +} + .grid--3-col { display: grid; grid-template-columns: repeat(1, 1fr); @@ -1727,3 +1967,75 @@ section > h2:not(.resource-section) { .justify-content-center { justify-content: center !important; } + +/* Community Spotlight Page */ +#spotlight-members-section { + padding: 3rem 0; +} + +.community-spotlight-hero p { + font-size: 1.25rem; +} + +/* Sidebar collapse button */ +.theme-doc-sidebar-container>div>button.button { + background-color: var(--ifm-font-color-base-inverse); +} + +.theme-doc-sidebar-container>div>button.button:hover { + background-color: var(--color-off-white); +} + +html[data-theme="dark"] .theme-doc-sidebar-container>div>button.button { + background-color: #1C1C1D; +} + +html[data-theme="dark"] .theme-doc-sidebar-container>div>button.button:hover { + background-color: #383839; +} + +.quickstart-container { + display: flex; + position: relative; + width: 100%; +} + +.quickstart-container.hidden { + display: none; +} + +.quickstart-container .step-container { + display: flex; + flex-direction: column; + width: 100%; +} + +.quickstart-container .step-container .intro { + order: -1; +} + +.quickstart-container 
.step-container .intro.hidden { + display: none; +} + +.quickstart-container .step-wrapper.hidden { + display: none; +} + +.card-container { + position: relative; +} + +.external-link { + position: absolute; + top: 0; + right: 0; + margin: 10px; + color: #818589; /* You can adjust the color as needed */ +} + +@media (max-width: 996px) { + .quickstart-container { + flex-direction: column; + } +} diff --git a/website/src/pages/community/forum.js b/website/src/pages/community/forum.js index 346d96e9adf..c3fde1a681d 100644 --- a/website/src/pages/community/forum.js +++ b/website/src/pages/community/forum.js @@ -7,7 +7,7 @@ function Events() { return ( - dbt Community Forum + Questions | dbt Developer Hub
          diff --git a/website/src/pages/dbt-cloud/api-v2-legacy.js b/website/src/pages/dbt-cloud/api-v2-legacy.js new file mode 100644 index 00000000000..37b1854f811 --- /dev/null +++ b/website/src/pages/dbt-cloud/api-v2-legacy.js @@ -0,0 +1,33 @@ +import React from "react"; +import Layout from "@theme/Layout"; + +import { RedocStandalone } from "redoc"; + +function dbtCloudAPI() { + return ( + + + + ); +} + +export default dbtCloudAPI; diff --git a/website/src/pages/dbt-cloud/api-v2.js b/website/src/pages/dbt-cloud/api-v2.js index 7f5ff81287d..f1ab333e89e 100644 --- a/website/src/pages/dbt-cloud/api-v2.js +++ b/website/src/pages/dbt-cloud/api-v2.js @@ -1,37 +1,22 @@ -import React from 'react'; -import Link from '@docusaurus/Link'; -import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; -import useBaseUrl from '@docusaurus/useBaseUrl'; -import Layout from '@theme/Layout'; -import classnames from 'classnames'; +import React, { Suspense } from "react"; +import Layout from "@theme/Layout"; +import BrowserOnly from "@docusaurus/BrowserOnly"; -import { RedocStandalone } from 'redoc'; +const LazyStoplight = React.lazy(() => import("../../components/stoplight")); +const Fallback = ( +
          +); function dbtCloudAPI() { - const context = useDocusaurusContext(); - return ( - + + {() => ( + + + + )} + ); } diff --git a/website/src/pages/dbt-cloud/api-v3.js b/website/src/pages/dbt-cloud/api-v3.js new file mode 100644 index 00000000000..bb5db5f4f63 --- /dev/null +++ b/website/src/pages/dbt-cloud/api-v3.js @@ -0,0 +1,24 @@ +import React, { Suspense } from "react"; +import Layout from "@theme/Layout"; +import BrowserOnly from "@docusaurus/BrowserOnly"; + +const LazyStoplight = React.lazy(() => import("../../components/stoplight")); +const Fallback = ( +
          +); + +function dbtCloudAPI() { + return ( + + + {() => ( + + + + )} + + + ); +} + +export default dbtCloudAPI; diff --git a/website/src/pages/dbt-cloud/api-v4.js b/website/src/pages/dbt-cloud/api-v4.js deleted file mode 100644 index 0cb9324af71..00000000000 --- a/website/src/pages/dbt-cloud/api-v4.js +++ /dev/null @@ -1,39 +0,0 @@ -import React from 'react'; -import Link from '@docusaurus/Link'; -import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; -import useBaseUrl from '@docusaurus/useBaseUrl'; -import Layout from '@theme/Layout'; -import classnames from 'classnames'; - -import { RedocStandalone } from 'redoc'; - -function dbtCloudAPI() { - const context = useDocusaurusContext(); - - return ( - - - - ); -} - -export default dbtCloudAPI; diff --git a/website/src/pages/index.js b/website/src/pages/index.js index 5c9de9e85b2..b5b3957e9e4 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -7,14 +7,15 @@ import BlogPostCard from '@site/src/components/blogPostCard'; import Hero from '@site/src/components/hero'; import PostCarousel from '@site/src/components/postCarousel'; import allBlogData from './../../.docusaurus/docusaurus-plugin-content-blog/default/blog-archive-80c.json' - +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import { getSpotlightMember } from '../utils/get-spotlight-member'; const bannerAnimation = require('@site/static/img/banner-white.svg'); function getBanner() { return { __html: bannerAnimation }; -}; +} function Home() { @@ -34,8 +35,25 @@ function Home() { title: "How we structure our dbt projects", description: "Our hands-on learnings for how to structure your dbt project for success and gain insights into the principles of analytics engineering.", link: "/guides/best-practices/how-we-structure/1-guide-overview", - image: "/img/structure-dbt-projects.png" + image: "/img/structure-dbt-projects.png", + sectionTitle: 'Featured resource' } + + // Set spotlightSection to 
featuredResource by default + let spotlightSection = featuredResource + + // Check if featured community spotlight member set in Docusaurus config + const { siteConfig } = useDocusaurusContext() + let communitySpotlightMember = siteConfig?.themeConfig?.communitySpotlightMember || null + + // Get spotlight member by ID or date if available + // If found, update section to show community spotlight member + // Otherwise, show featured resource + const spotlightMember = getSpotlightMember(communitySpotlightMember) + if(spotlightMember) { + spotlightSection = spotlightMember + } + return ( <> @@ -44,7 +62,7 @@ function Home() {
          -
          +

          Popular resources

          @@ -60,7 +78,7 @@ function Home() {
          @@ -82,9 +100,9 @@ function Home() {
          -

          Featured resource

          +

          {spotlightSection?.sectionTitle ? spotlightSection.sectionTitle : 'Featured resource'}

          - +
          diff --git a/website/src/pages/report-a-problem.js b/website/src/pages/report-a-problem.js new file mode 100644 index 00000000000..c12b394e45f --- /dev/null +++ b/website/src/pages/report-a-problem.js @@ -0,0 +1,35 @@ +import React from "react"; +import Layout from "@theme/Layout"; +import Hero from "@site/src/components/hero"; +import { HubSpotForm } from "@site/src/components/hubspotForm"; + +function ReportAProblem() { + return ( + <> + +
          + +
          +
          + +
          +
          +
          +
          + + ); +} + +export default ReportAProblem; diff --git a/website/src/pages/styles.js b/website/src/pages/styles.js index 3f326104633..d0bcf2f227c 100644 --- a/website/src/pages/styles.js +++ b/website/src/pages/styles.js @@ -1,23 +1,13 @@ import React from 'react'; -import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; -import useBaseUrl from '@docusaurus/useBaseUrl'; import Layout from '@theme/Layout'; -import classnames from 'classnames'; - -import {MDXProvider} from '@mdx-js/react'; -import MDXComponents from '@theme/MDXComponents'; import CodeBlock from '@theme/CodeBlock'; - import Changelog from '@site/src/components/changelog'; import CloudCore from '@site/src/components/cloudcore'; -import WHCode from '@site/src/components/whcode'; import Collapsible from '@site/src/components/collapsible'; -import FAQList from '@site/src/components/faqList'; import FAQ from '@site/src/components/faqs'; import File from '@site/src/components/file'; import Lightbox from '@site/src/components/lightbox'; -import Link from '@site/src/components/link'; import LoomVideo from '@site/src/components/loom'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; @@ -105,9 +95,9 @@ description: "this is \[an escaped link](docs.getdbt.com)"

          FAQ

          -
          {``}
          - - +
          {``}
          + +
          @@ -136,7 +126,7 @@ password: hunter2

          Markdown Links

          - Refer to the Links section of the Content Style Guide to read about how you can use links in the dbt product documentation. + Refer to the Links section of the Content Style Guide to read about how you can use links in the dbt product documentation.
          diff --git a/website/src/stores/VersionContext.js b/website/src/stores/VersionContext.js index b84b7d7a05b..835954145d3 100644 --- a/website/src/stores/VersionContext.js +++ b/website/src/stores/VersionContext.js @@ -8,7 +8,7 @@ const VersionContext = createContext({ EOLDate: lastReleasedVersion.EOLDate || undefined, isPrerelease: lastReleasedVersion.isPrerelease || false, latestStableRelease: lastReleasedVersion.version, - updateVersion: () => {}, + updateVersion: () => Object, }) export const VersionContextProvider = ({ value = "", children }) => { @@ -46,8 +46,10 @@ export const VersionContextProvider = ({ value = "", children }) => { const updateVersion = (e) => { if(!e.target) return - - const versionValue = e.target.text.replace('v', '') + + const vRegex = /(?:v)?(\d+(\.\d+)*)/ // Regex that will parse out the version number, even if there is/isn't a 'v' in front of version number and a '(Beta)' afterwards. + const versionValue = e.target.text.match(vRegex)[1] + versionValue && setVersion(versionValue) window.localStorage.setItem('dbtVersion', versionValue) diff --git a/website/src/theme/AnnouncementBar/index.js b/website/src/theme/AnnouncementBar/index.js index f82645c9b1c..75d19ca9267 100644 --- a/website/src/theme/AnnouncementBar/index.js +++ b/website/src/theme/AnnouncementBar/index.js @@ -1,32 +1,25 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ import React from 'react'; -import clsx from 'clsx'; -import {useThemeConfig, useAnnouncementBar} from '@docusaurus/theme-common'; -import {translate} from '@docusaurus/Translate'; -import IconClose from '@theme/IconClose'; +import {useThemeConfig} from '@docusaurus/theme-common'; +import {useAnnouncementBar} from '@docusaurus/theme-common/internal'; +import AnnouncementBarCloseButton from '@theme/AnnouncementBar/CloseButton'; +import AnnouncementBarContent from '@theme/AnnouncementBar/Content'; import styles from './styles.module.css'; + +/* dbt Customizations: + * Wrap entire AnnouncementBar in link to make whole banner clickable +*/ + export default function AnnouncementBar() { - const {isActive, close} = useAnnouncementBar(); const {announcementBar, announcementBarActive, announcementBarLink} = useThemeConfig(); - + const {isActive, close} = useAnnouncementBar(); if (!isActive || !announcementBarActive) { return null; } - - const {content, backgroundColor, textColor, isCloseable} = announcementBar; - + const {backgroundColor, textColor, isCloseable} = announcementBar; return (
          {isCloseable &&
          } {announcementBarLink ? ( @@ -40,37 +33,17 @@ export default function AnnouncementBar() { color: textColor, }} > - + ) : ( - + )} - - {isCloseable ? ( - - ) : null} + className={styles.announcementBarClose} + /> + )}
          ); } - -function AnnouncementBarContent({ content, styles }) { - return ( -
          - ) -} diff --git a/website/src/theme/AnnouncementBar/styles.module.css b/website/src/theme/AnnouncementBar/styles.module.css index e76f981dfbb..1d3f7ba390b 100644 --- a/website/src/theme/AnnouncementBar/styles.module.css +++ b/website/src/theme/AnnouncementBar/styles.module.css @@ -1,10 +1,3 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - :root { --docusaurus-announcement-bar-height: auto; } @@ -13,6 +6,15 @@ display: flex; align-items: center; height: auto; + + /* + Unfortunately we can't make announcement bar render above the navbar + IE need to use border-bottom instead of shadow + See https://github.com/facebookincubator/infima/issues/275 + + box-shadow: var(--ifm-global-shadow-lw); + z-index: calc(var(--ifm-z-index-fixed) + 1); + */ } html[data-announcement-bar-initially-dismissed='true'] .announcementBar { @@ -26,8 +28,6 @@ html[data-announcement-bar-initially-dismissed='true'] .announcementBar { .announcementBarClose { flex: 0 0 30px; align-self: stretch; - padding: 0; - line-height: 0; color: var(--ifm-color-white); opacity: 1; } @@ -45,11 +45,6 @@ html[data-announcement-bar-initially-dismissed='true'] .announcementBar { } } -.announcementBarContent a { - color: inherit; - text-decoration: underline; -} - @media (min-width: 997px) { .announcementBarPlaceholder, .announcementBarClose { @@ -61,6 +56,7 @@ html[data-announcement-bar-initially-dismissed='true'] .announcementBar { color: var(--ifm-color-white); width: 100%; } + .announcementBarLink:hover { color: var(--ifm-color-white); text-decoration: none; diff --git a/website/src/theme/BlogLayout/index.js b/website/src/theme/BlogLayout/index.js index 5905a60f6fe..6ecf1ccbd23 100644 --- a/website/src/theme/BlogLayout/index.js +++ b/website/src/theme/BlogLayout/index.js @@ -1,62 +1,75 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. 
- * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - import React from 'react'; - import clsx from 'clsx'; - import Layout from '@theme/Layout'; - import BlogSidebar from '@theme/BlogSidebar'; - import TOC from '@theme/TOC'; - - // dbt Custom - import Head from '@docusaurus/Head'; - import Link from '@docusaurus/Link'; - import {usePluginData} from '@docusaurus/useGlobalData'; - - function BlogLayout(props) { - const {title, description, blogPageTitle, sidebar, toc, children, ...layoutProps} = props; - - // dbt Custom - const { blogMeta, tagData } = usePluginData('docusaurus-build-global-data-plugin'); - const { - featured_image, - featured_cta, - show_title, - show_description, - hero_button_url, - hero_button_text, - hero_button_new_tab, - show_left_sidebar - } = blogMeta +import React from 'react'; +import clsx from 'clsx'; +import Layout from '@theme/Layout'; +import BlogSidebar from '@theme/BlogSidebar'; - const hasSidebar = layoutProps.pageClassName === "blog-post-page" - ? 
show_left_sidebar && (sidebar && sidebar.items.length > 0) - : sidebar && sidebar.items.length > 0 +/* dbt Customizations: + * Import Head, Link, plugin & context hooks + * Sets states for checking if blog post or blog list page + * Get blogMeta and tagData global data from plugin + * Get blogData from docusaurus.config.js + * Set custom breadcrumb and meta titles + * Set custom featured image from website/blog/metadata.yml + * Show hero card section if blog list page + * Adds breadcrumbs section + * Hide sidebar if blog post page and adjusts column sizing +*/ +import Head from '@docusaurus/Head'; +import Link from '@docusaurus/Link'; +import {usePluginData} from '@docusaurus/useGlobalData'; +import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; - // The pageTitle variable sets the final item in breadcrumbs - let breadcrumbTitle = undefined - if(blogPageTitle) { - // Set to blogPageTitle prop - breadcrumbTitle = blogPageTitle - } else if(title) { - // Set to title prop - breadcrumbTitle = title - } else if (layoutProps.pageClassName === "blog-post-page") { - // Set to blog post title - const { props: { frontMatter }} = children.find(child => child.props.frontMatter) - breadcrumbTitle = frontMatter.title - } +export default function BlogLayout(props) { + const { + sidebar, + toc, + children, + title, + description, + isBlogList, + isBlogPost, + ...layoutProps + } = props; + const hasSidebar = sidebar && sidebar.items.length > 0; - const metaTitle = layoutProps.pageClassName !== "blog-list-page" - ? 
`${breadcrumbTitle} | dbt Developer Blog` - : breadcrumbTitle + const { blogMeta, tagData } = usePluginData('docusaurus-build-global-data-plugin'); + const { siteConfig: { presets } } = useDocusaurusContext() + // Get blog data from docusaurus config + const blogData = presets && presets.reduce((acc, preset) => { + const context = preset?.find(item => item['blog']) + if(context) acc = context['blog'] + return acc + }, {}) - return ( - - - {/* Set Custom Metadata */} + const { + featured_image, + show_title, + show_description, + hero_button_url, + hero_button_text, + hero_button_new_tab, + } = blogMeta + + // The pageTitle variable sets the final item in breadcrumbs + let breadcrumbTitle = undefined + if(title) { + // Set to title prop + breadcrumbTitle = title + } + + // Set custom meta title for blog list and blog post pages + let metaTitle = undefined + if(isBlogList) { + metaTitle = blogData.blogTitle + } else if(breadcrumbTitle) { + metaTitle = `${breadcrumbTitle} | dbt Developer Blog` + } + // end dbtCustom + + return ( + + + {/* Set Custom Metadata */} {featured_image && featured_image !== "" && @@ -78,22 +91,15 @@ } {/* End Custom Metadata */} - - {featured_image && featured_image !== "" && - -
          - - } - - {layoutProps.pageClassName && layoutProps.pageClassName === "blog-list-page" && - ((show_title || show_description) && (title || description)) && ( + + {isBlogList && + ((show_title || show_description) && (blogData?.blogTitle || blogData?.blogDescription)) && (
          - {title && show_title &&

          {title}

          } - {description && show_description &&

          {description}

          } - {blogMeta.test} + {blogData.blogTitle && show_title &&

          {blogData.blogTitle}

          } + {blogData.blogDescription && show_description &&

          {blogData.blogDescription}

          } {(hero_button_text !== "" && hero_button_text !== "") && ( hero_button_new_tab ? ( {hero_button_text} ) : ( dbt Docs Developer Blog - {(layoutProps.pageClassName !== "blog-list-page" && breadcrumbTitle) && + {(!isBlogList && breadcrumbTitle) && {breadcrumbTitle} }
          - -
          - {toc && ( -
          -
          - - -
          -
          - )} -
          - -
          - {hasSidebar && ( - - )} -
          -
          -
          -
          - {children} -
          - {layoutProps.pageClassName && -
          - {toc && ( - - )} -
          - } -
          -
          -
          -
          - - ); - } - - // Show or hide table of contents for mobile - function handleTocClick(e) { - const tocButton = document.querySelector('#mobile-toc-dropdown > button') - const toc = document.querySelector('#mobile-toc-dropdown > div') - - if(toc.classList.contains('tocActive')) { - toc.classList.remove('tocActive') - tocButton.classList.remove('tocActive') - } else { - toc.classList.add('tocActive') - tocButton.classList.add('tocActive') - } - } - - export default BlogLayout; - \ No newline at end of file + +
          +
          + {!isBlogPost && } +
          + {children} +
          + {toc &&
          {toc}
          } +
          +
          + + ); +} diff --git a/website/src/theme/BlogListPage/index.js b/website/src/theme/BlogListPage/index.js index 627f08abdbf..674825921c2 100644 --- a/website/src/theme/BlogListPage/index.js +++ b/website/src/theme/BlogListPage/index.js @@ -1,115 +1,54 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ import React from 'react'; -import Link from '@docusaurus/Link'; +import clsx from 'clsx'; import useDocusaurusContext from '@docusaurus/useDocusaurusContext'; +import { + PageMetadata, + HtmlClassNameProvider, + ThemeClassNames, +} from '@docusaurus/theme-common'; import BlogLayout from '@theme/BlogLayout'; -import BlogPostItem from '@theme/BlogPostItem'; import BlogListPaginator from '@theme/BlogListPaginator'; -import {ThemeClassNames} from '@docusaurus/theme-common'; -import {usePluginData} from '@docusaurus/useGlobalData'; +import SearchMetadata from '@theme/SearchMetadata'; +import BlogPostItems from '@theme/BlogPostItems'; -function BlogListPage(props) { - const {metadata, items, sidebar} = props; +/* dbt Customizations: + * Send custom isBlogList prop to BlogLayout + * This determines whether to show the Blog header or not +*/ + +function BlogListPageMetadata(props) { + const {metadata} = props; const { siteConfig: {title: siteTitle}, } = useDocusaurusContext(); const {blogDescription, blogTitle, permalink} = metadata; const isBlogOnlyMode = permalink === '/'; const title = isBlogOnlyMode ? 
siteTitle : blogTitle; - - // dbt Custom - const { - blogMeta, - tagData - } = usePluginData('docusaurus-build-global-data-plugin'); - const { - featured_posts_count, - regular_posts_count - } = blogMeta - - // Sort posts by date then title - const handlePostsSort = (a, b) => { - const { date: a_date, title: a_title } = a.content.frontMatter - const { date: b_date, title: b_title } = b.content.frontMatter - return b_date - a_date || a_title.localeCompare(b_title) - } - - // Set Featured Posts - const featuredPosts = items - .filter(post => post.content.frontMatter.is_featured) - .slice(0, featured_posts_count ? featured_posts_count : 2) - - // Get all non-featured posts - let allOtherPosts = items - .filter(post => !post.content.frontMatter.is_featured) - - // Get all featured posts - // which aren't included in featured posts section - const allOtherFeaturedPosts = items - .filter(post => post.content.frontMatter.is_featured) - .slice(featured_posts_count ? featured_posts_count : 2) - - // Group together all posts not featured - allOtherPosts = allOtherPosts.concat(allOtherFeaturedPosts) - return ( - - - {/* Featured Posts */} -
          -

          Featured Posts

          -
          - {featuredPosts - .sort(handlePostsSort) - .map(({content: BlogPostContent}) => ( - - - - ))} -
          -
          - -
          -

          Recent Posts

          -
          - {allOtherPosts - .sort(handlePostsSort) - .slice(0, regular_posts_count ? regular_posts_count : 15) - .map(({content: BlogPostContent}) => ( - - - - ))} -
          -
          - - {/* */} + <> + + + + ); +} +function BlogListPageContent(props) { + const {metadata, items, sidebar} = props; + return ( + + + ); } - -export default BlogListPage; +export default function BlogListPage(props) { + return ( + + + + + ); +} diff --git a/website/src/theme/BlogPostAuthor/index.js b/website/src/theme/BlogPostAuthor/index.js deleted file mode 100644 index ccdeeb56314..00000000000 --- a/website/src/theme/BlogPostAuthor/index.js +++ /dev/null @@ -1,46 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ -import React from 'react'; -import Link from '@docusaurus/Link'; -import styles from './styles.module.css'; - -function BlogPostAuthor({author}) { - const {name, job_title, organization, url, imageURL, key} = author; - return ( -
          - {imageURL && ( - - {name} - - )} - - { - // Note: only legacy author frontmatter allow empty name (not frontMatter.authors) - name && ( - - ) - } -
          - ); -} - -export default BlogPostAuthor; diff --git a/website/src/theme/BlogPostAuthor/styles.module.css b/website/src/theme/BlogPostAuthor/styles.module.css deleted file mode 100644 index aae60add2db..00000000000 --- a/website/src/theme/BlogPostAuthor/styles.module.css +++ /dev/null @@ -1,12 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -.image { - width: 100%; - height: 100%; - object-fit: cover; -} diff --git a/website/src/theme/BlogPostItem/Header/Author/index.js b/website/src/theme/BlogPostItem/Header/Author/index.js new file mode 100644 index 00000000000..f82428df789 --- /dev/null +++ b/website/src/theme/BlogPostItem/Header/Author/index.js @@ -0,0 +1,49 @@ +import React from 'react'; +import clsx from 'clsx'; +import Link from '@docusaurus/Link'; +import imageCacheWrapper from '../../../../../functions/image-cache-wrapper'; +function MaybeLink(props) { + if (props.href || props.slug) { + return ; + } + return <>{props.children}; +} + +/* dbt Customizations: + * Gets 'key' from author props + * Does not use 'MaybeLink', instead + * always uses link and sets url to /author/{key} +*/ + +export default function BlogPostItemHeaderAuthor({author, className}) { + const {name, url, imageURL, email, key, job_title, organization} = author; + const link = url || (email && `mailto:${email}`) || undefined; + return ( +
          + {imageURL && ( + + {name} + + )} + + {name && ( + + )} +
          + ); +} diff --git a/website/src/theme/BlogPostItem/index.js b/website/src/theme/BlogPostItem/index.js deleted file mode 100644 index 071319aae31..00000000000 --- a/website/src/theme/BlogPostItem/index.js +++ /dev/null @@ -1,203 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -/** - * Swizzled on 10-15-21 - * - * Custom dbtLabs Changes: - * - Add image above title for blog posts - */ - -import React, { useEffect } from 'react'; -import clsx from 'clsx'; -import {MDXProvider} from '@mdx-js/react'; -import Translate, {translate} from '@docusaurus/Translate'; -import Head from '@docusaurus/Head'; -import Link from '@docusaurus/Link'; -import {useBaseUrlUtils} from '@docusaurus/useBaseUrl'; -import {usePluralForm} from '@docusaurus/theme-common'; -import MDXComponents from '@theme/MDXComponents'; -import EditThisPage from '@theme/EditThisPage'; -import styles from './styles.module.css'; -import TagsListInline from '@theme/TagsListInline'; -import BlogPostAuthors from '@theme/BlogPostAuthors'; // Very simple pluralization: probably good enough for now - -function useReadingTimePlural() { - const {selectMessage} = usePluralForm(); - return (readingTimeFloat) => { - const readingTime = Math.ceil(readingTimeFloat); - return selectMessage( - readingTime, - translate( - { - id: 'theme.blog.post.readingTime.plurals', - description: - 'Pluralized label for "{readingTime} min read". 
Use as much plural forms (separated by "|") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)', - message: 'One min read|{readingTime} min read', - }, - { - readingTime, - }, - ), - ); - }; -} - -function BlogPostItem(props) { - const readingTimePlural = useReadingTimePlural(); - const {withBaseUrl} = useBaseUrlUtils(); - const { - children, - frontMatter, - assets, - metadata, - truncated, - isBlogPostPage = false, - } = props; - const { - date, - formattedDate, - permalink, - tags, - readingTime, - title, - editUrl, - authors, - } = metadata; - const image = assets.image ?? frontMatter.image ?? '/img/avatar.png'; - - const renderPostHeader = () => { - const TitleHeading = isBlogPostPage ? 'h1' : 'h2'; - return ( -
          - - - {isBlogPostPage ? ( - title - ) : ( - - {title} - - )} - -
          - - - {typeof readingTime !== 'undefined' && ( - <> - {' · '} - {readingTimePlural(readingTime)} - - )} -
          - -
          - ); - }; - - // dbt custom - send blog context to datalayer to send to snowplow - useEffect(() => { - let blogContext = { - event: 'blogContext', - blogAuthor: '', - blogCategory: '', - blogDate: formattedDate ? formattedDate : undefined - } - - if(authors && authors.length > 0) { - authors.map((author, i) => { - blogContext.blogAuthor += - `${author.name}${i !== authors.length - 1 ? ', ' : ''}` - }) - } - - if(tags && tags.length > 0) { - tags.map((tag, i) => { - blogContext.blogCategory += - `${tag.label}${i !== tags.length - 1 ? ', ' : ''}` - }) - } - - // Only send to datalayer if blog post page - if(isBlogPostPage) { - window.dataLayer = window.dataLayer || []; - dataLayer && dataLayer.push(blogContext) - } - }, []) - - return ( - <> - {frontMatter.canonical_url && ( - - - - )} -
          - {renderPostHeader()} - - {image && ( - - )} - -
          - {children} -
          - - {(tags.length > 0 || truncated) && ( -
          - {tags.length > 0 && ( -
          - -
          - )} - - {isBlogPostPage && editUrl && ( -
          - -
          - )} - - {!isBlogPostPage && truncated && ( -
          - - - - Read More - - - -
          - )} -
          - )} -
          - - ); -} - -export default BlogPostItem; diff --git a/website/src/theme/BlogPostItem/styles.module.css b/website/src/theme/BlogPostItem/styles.module.css deleted file mode 100644 index 1acc4921d9d..00000000000 --- a/website/src/theme/BlogPostItem/styles.module.css +++ /dev/null @@ -1,14 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -.blogPostData { - font-size: 0.9rem; -} - -.blogPostDetailsFull { - flex-direction: column; -} diff --git a/website/src/theme/BlogPostPage/index.js b/website/src/theme/BlogPostPage/index.js index 58b7dea63e4..9569df4d12b 100644 --- a/website/src/theme/BlogPostPage/index.js +++ b/website/src/theme/BlogPostPage/index.js @@ -1,88 +1,119 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - import React from 'react'; - import Seo from '@theme/Seo'; - import Head from '@docusaurus/Head'; - import BlogLayout from '@theme/BlogLayout'; - import BlogPostItem from '@theme/BlogPostItem'; - import BlogPostPaginator from '@theme/BlogPostPaginator'; - import {ThemeClassNames} from '@docusaurus/theme-common'; - - function BlogPostPage(props) { - const {content: BlogPostContents, sidebar} = props; - const {frontMatter, assets, metadata} = BlogPostContents; - const { - title, - description, - nextItem, - prevItem, - date, - tags, - authors, - } = metadata; - const {hide_table_of_contents: hideTableOfContents, keywords} = frontMatter; - const image = assets.image ?? 
frontMatter.image; - return ( - - - - - - {/* TODO double check those article metas array syntaxes, see https://ogp.me/#array */} - {authors.some((author) => author.url) && ( - author.url) - .filter(Boolean) - .join(',')} - /> - )} - {tags.length > 0 && ( - tag.label).join(',')} - /> - )} - - - {/* dbt Custom */} - - {title} | dbt Developer Blog - - - {/* End dbt Custom */} - - - - - - {(nextItem || prevItem) && ( - - )} - - ); - } - - export default BlogPostPage; +import React, { useEffect } from 'react'; +import clsx from 'clsx'; +import {HtmlClassNameProvider, ThemeClassNames} from '@docusaurus/theme-common'; +import {BlogPostProvider, useBlogPost} from '@docusaurus/theme-common/internal'; +import BlogLayout from '@theme/BlogLayout'; +import BlogPostItem from '@theme/BlogPostItem'; +import BlogPostPaginator from '@theme/BlogPostPaginator'; +import BlogPostPageMetadata from '@theme/BlogPostPage/Metadata'; +import TOC from '@theme/TOC'; +import TOCCollapsible from '@theme/TOCCollapsible'; +import styles from './styles.module.css'; +import { DiscourseBlogComments } from '@site/src/components/discourseBlogComments'; + +/* dbt Customizations: + * Import global data from plugin + * Import TOCCollapsible and custom styles + * Gets authors, tags, formattedDate from metadata + * Send authors, tags, formattedDate to snowplow + * Passes post title prop BlogLayout to display in breadcrumbs + * Get featured_cta from global data and pass to TOC +*/ +import {usePluginData} from '@docusaurus/useGlobalData'; + +function BlogPostPageContent({sidebar, children}) { + const {metadata, toc} = useBlogPost(); + const {nextItem, prevItem, frontMatter, authors, tags, formattedDate} = metadata; + const { + hide_table_of_contents: hideTableOfContents, + toc_min_heading_level: tocMinHeadingLevel, + toc_max_heading_level: tocMaxHeadingLevel + } = frontMatter; + + // dbt Custom - send blog post context to datalayer to send to snowplow + useEffect(() => { + let blogContext = { + event: 
'blogContext', + blogAuthor: '', + blogCategory: '', + blogDate: formattedDate ? formattedDate : undefined + } + + if(authors && authors.length > 0) { + authors.map((author, i) => { + author?.name && ( + blogContext.blogAuthor += + `${author.name}${i !== authors.length - 1 ? ', ' : ''}` + ) + }) + } + + if(tags && tags.length > 0) { + tags.map((tag, i) => { + tag?.label && ( + blogContext.blogCategory += + `${tag.label}${i !== tags.length - 1 ? ', ' : ''}` + ) + }) + } + + window.dataLayer = window.dataLayer || []; + window.dataLayer && window.dataLayer.push(blogContext) + }, []) + + const { blogMeta } = usePluginData('docusaurus-build-global-data-plugin'); + const { featured_cta } = blogMeta + + return ( + 0 ? ( + + ) : undefined } + isBlogPost={true} + > + + {!hideTableOfContents && toc.length > 0 && ( + + )} + + {children} + + + + {(nextItem || prevItem) && ( + + )} + + ); +} +export default function BlogPostPage(props) { + const BlogPostContent = props.content; + return ( + + + + + + + + + ); +} diff --git a/website/src/theme/BlogPostPage/styles.module.css b/website/src/theme/BlogPostPage/styles.module.css new file mode 100644 index 00000000000..9e4ea1f5752 --- /dev/null +++ b/website/src/theme/BlogPostPage/styles.module.css @@ -0,0 +1,12 @@ +/* dbt custom */ +@media (min-width: 997px) { + .tocMobile { + display: none; + } +} + +@media print { + .tocMobile { + display: none; + } +} diff --git a/website/src/theme/BlogSidebar/Desktop/index.js b/website/src/theme/BlogSidebar/Desktop/index.js new file mode 100644 index 00000000000..322bb99dd72 --- /dev/null +++ b/website/src/theme/BlogSidebar/Desktop/index.js @@ -0,0 +1,66 @@ +import React from 'react'; +import clsx from 'clsx'; +import Link from '@docusaurus/Link'; +import {translate} from '@docusaurus/Translate'; +import styles from './styles.module.css'; + +/* dbt Customizations: + * Brings in tagData prop + * If tagData available, add 'Categories' section to sidebar +*/ + +export default function 
BlogSidebarDesktop({sidebar, tagData}) { + return ( + + ); +} diff --git a/website/src/theme/BlogSidebar/styles.module.css b/website/src/theme/BlogSidebar/Desktop/styles.module.css similarity index 61% rename from website/src/theme/BlogSidebar/styles.module.css rename to website/src/theme/BlogSidebar/Desktop/styles.module.css index 5c64cc008ef..ab6c3afb61f 100644 --- a/website/src/theme/BlogSidebar/styles.module.css +++ b/website/src/theme/BlogSidebar/Desktop/styles.module.css @@ -1,26 +1,18 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - .sidebar { max-height: calc(100vh - (var(--ifm-navbar-height) + 2rem)); overflow-y: auto; position: sticky; top: calc(var(--ifm-navbar-height) + 2rem); - margin-top: 1rem; } .sidebarItemTitle { - font-size: var(--ifm-h4-font-size); + font-size: var(--ifm-h3-font-size); font-weight: var(--ifm-font-weight-bold); } .sidebarItemList { - list-style: none; font-size: 0.9rem; + list-style: none; padding-left: 0; } @@ -30,15 +22,18 @@ .sidebarItemLink { color: var(--ifm-font-color-base); + display: block; } + .sidebarItemLink:hover { text-decoration: none; } + .sidebarItemLinkActive { - color: var(--ifm-color-primary); + color: var(--ifm-color-primary) !important; } -@media only screen and (max-width: 996px) { +@media (max-width: 996px) { .sidebar { display: none; } diff --git a/website/src/theme/BlogSidebar/index.js b/website/src/theme/BlogSidebar/index.js index c68112aa0a1..8770a7a0794 100644 --- a/website/src/theme/BlogSidebar/index.js +++ b/website/src/theme/BlogSidebar/index.js @@ -1,71 +1,20 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ import React from 'react'; -import clsx from 'clsx'; -import Link from '@docusaurus/Link'; -import styles from './styles.module.css'; -import {translate} from '@docusaurus/Translate'; +import {useWindowSize} from '@docusaurus/theme-common'; +import BlogSidebarDesktop from '@theme/BlogSidebar/Desktop'; +import BlogSidebarMobile from '@theme/BlogSidebar/Mobile'; + +/* dbt Customizations: + * Passes tagData to BlogSidebarDesktop to display categories +*/ export default function BlogSidebar({sidebar, tagData}) { - if (sidebar.items.length === 0) { + const windowSize = useWindowSize(); + if (!sidebar?.items.length) { return null; } - - return ( - - ); + // Mobile sidebar doesn't need to be server-rendered + if (windowSize === 'mobile') { + return ; + } + return ; } diff --git a/website/src/theme/BlogTagsPostsPage/index.js b/website/src/theme/BlogTagsPostsPage/index.js index efb78485c44..c5f782a44b8 100644 --- a/website/src/theme/BlogTagsPostsPage/index.js +++ b/website/src/theme/BlogTagsPostsPage/index.js @@ -1,20 +1,28 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ import React from 'react'; +import clsx from 'clsx'; +import Translate, {translate} from '@docusaurus/Translate'; +import { + PageMetadata, + HtmlClassNameProvider, + ThemeClassNames, + usePluralForm, +} from '@docusaurus/theme-common'; import Link from '@docusaurus/Link'; import BlogLayout from '@theme/BlogLayout'; -import BlogPostItem from '@theme/BlogPostItem'; -import Translate, {translate} from '@docusaurus/Translate'; -import {ThemeClassNames, usePluralForm} from '@docusaurus/theme-common'; // Very simple pluralization: probably good enough for now +import BlogListPaginator from '@theme/BlogListPaginator'; +import SearchMetadata from '@theme/SearchMetadata'; +import BlogPostItems from '@theme/BlogPostItems'; -// dbt Custom +/* dbt Customizations: + * Imports Head and usePluginData + * Gets tag data from plugin data + * Passes thisTagData.title to title prop in BlogLayout + * Shows custom header with title & description +*/ import Head from '@docusaurus/Head'; import {usePluginData} from '@docusaurus/useGlobalData'; +// Very simple pluralization: probably good enough for now function useBlogPostsPlural() { const {selectMessage} = usePluralForm(); return (count) => @@ -27,45 +35,42 @@ function useBlogPostsPlural() { 'Pluralized label for "{count} posts". 
Use as much plural forms (separated by "|") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)', message: 'One post|{count} posts', }, - { - count, - }, + {count}, ), ); } - -export default function BlogTagsPostsPage(props) { - const {metadata, items, sidebar} = props; - const {allTagsPath, name: tagName, count} = metadata; +function useBlogTagsPostsPageTitle(tag) { const blogPostsPlural = useBlogPostsPlural(); - const title = translate( + return translate( { id: 'theme.blog.tagTitle', description: 'The title of the page for a blog tag', message: '{nPosts} tagged with "{tagName}"', }, - { - nPosts: blogPostsPlural(count), - tagName, - }, + {nPosts: blogPostsPlural(tag.count), tagName: tag.label}, ); +} +function BlogTagsPostsPageMetadata({tag}) { + const title = useBlogTagsPostsPageTitle(tag); + return ( + <> + + + + ); +} +function BlogTagsPostsPageContent({tag, items, sidebar, listMetadata}) { + const title = useBlogTagsPostsPageTitle(tag); - // dbt Custom const { tagData } = usePluginData('docusaurus-build-global-data-plugin'); - const thisTagData = tagData.find(tag => tag.name === tagName) + const thisTagData = tagData.find(item => item.name === tag.label) return ( - -
          + +
          {/* dbt Custom */} {thisTagData ? ( @@ -80,7 +85,7 @@ export default function BlogTagsPostsPage(props) { ) : ''} {/* end dbt Custom */} - + @@ -88,17 +93,20 @@ export default function BlogTagsPostsPage(props) {
          - - {items.map(({content: BlogPostContent}) => ( - - - - ))} + +
          ); } +export default function BlogTagsPostsPage(props) { + return ( + + + + + ); +} diff --git a/website/src/theme/CodeBlock/Content/String.js b/website/src/theme/CodeBlock/Content/String.js new file mode 100644 index 00000000000..ef8a0220dcb --- /dev/null +++ b/website/src/theme/CodeBlock/Content/String.js @@ -0,0 +1,107 @@ +import React from 'react'; +import clsx from 'clsx'; +import {useThemeConfig, usePrismTheme} from '@docusaurus/theme-common'; +import { + parseCodeBlockTitle, + parseLanguage, + parseLines, + containsLineNumbers, + useCodeWordWrap, +} from '@docusaurus/theme-common/internal'; +import Highlight, {defaultProps} from 'prism-react-renderer'; +import Line from '@theme/CodeBlock/Line'; +import CopyButton from '@theme/CodeBlock/CopyButton'; +import WordWrapButton from '@theme/CodeBlock/WordWrapButton'; +import Container from '@theme/CodeBlock/Container'; +import styles from './styles.module.css'; + +/* dbt Customizations: + * Adds custom squashLinks method to + * allow links in markdown. + */ +import squashLinks from './inline-link'; + +export default function CodeBlockString({ + children, + className: blockClassName = '', + metastring, + title: titleProp, + showLineNumbers: showLineNumbersProp, + language: languageProp, +}) { + const { + prism: {defaultLanguage, magicComments}, + } = useThemeConfig(); + const language = + languageProp ?? parseLanguage(blockClassName) ?? defaultLanguage; + const prismTheme = usePrismTheme(); + const wordWrap = useCodeWordWrap(); + // We still parse the metastring in case we want to support more syntax in the + // future. Note that MDX doesn't strip quotes when parsing metastring: + // "title=\"xyz\"" => title: "\"xyz\"" + const title = parseCodeBlockTitle(metastring) || titleProp; + const {lineClassNames, code} = parseLines(children, { + metastring, + language, + magicComments, + }); + const showLineNumbers = + showLineNumbersProp ?? containsLineNumbers(metastring); + return ( + + {title &&
          {title}
          } +
          + + {({className, tokens, getLineProps, getTokenProps}) => ( +
          +              
          +                {tokens.map((line, i) => {
          +                  const squashedLine = squashLinks(line)
          +                  return (
          +                    
          +                  )
          +                })}
          +              
          +            
          + )} +
          +
          + {(wordWrap.isEnabled || wordWrap.isCodeScrollable) && ( + wordWrap.toggle()} + isEnabled={wordWrap.isEnabled} + /> + )} + +
          +
          +
          + ); +} diff --git a/website/src/theme/CodeBlock/inline-link.js b/website/src/theme/CodeBlock/Content/inline-link.js similarity index 97% rename from website/src/theme/CodeBlock/inline-link.js rename to website/src/theme/CodeBlock/Content/inline-link.js index fff05afc777..a8fdb9fce33 100644 --- a/website/src/theme/CodeBlock/inline-link.js +++ b/website/src/theme/CodeBlock/Content/inline-link.js @@ -1,6 +1,6 @@ import React from 'react'; -import Link from '@site/src/components/link'; +import Link from '@docusaurus/Link'; function isMarkdownLink(string) { const regex = /(?\\?)\[(?.*?)\]\((?.*?)\)/ diff --git a/website/src/theme/CodeBlock/styles.module.css b/website/src/theme/CodeBlock/Content/styles.module.css similarity index 58% rename from website/src/theme/CodeBlock/styles.module.css rename to website/src/theme/CodeBlock/Content/styles.module.css index a1bd413658d..6f7bdaa5613 100644 --- a/website/src/theme/CodeBlock/styles.module.css +++ b/website/src/theme/CodeBlock/Content/styles.module.css @@ -5,16 +5,11 @@ * LICENSE file in the root directory of this source tree. 
*/ -.codeBlockContainer { - margin-bottom: var(--ifm-leading); - border-radius: var(--ifm-global-radius); - box-shadow: var(--ifm-global-shadow-lw); -} - .codeBlockContent { position: relative; /* rtl:ignore */ direction: ltr; + border-radius: inherit; } .codeBlockTitle { @@ -22,43 +17,23 @@ font-size: var(--ifm-code-font-size); font-weight: 500; padding: 0.75rem var(--ifm-pre-padding); - border-top-left-radius: var(--ifm-global-radius); - border-top-right-radius: var(--ifm-global-radius); + border-top-left-radius: inherit; + border-top-right-radius: inherit; } .codeBlock { + --ifm-pre-background: var(--prism-background-color); margin: 0; padding: 0; - border-radius: var(--ifm-global-radius); } -.codeBlockTitle + .codeBlockContent .codeBlock { +.codeBlockTitle+.codeBlockContent .codeBlock { border-top-left-radius: 0; border-top-right-radius: 0; } .codeBlockStandalone { padding: 0; - border-radius: var(--ifm-global-radius); -} - -.copyButton { - background: rgb(0 0 0 / 30%); - border-radius: var(--ifm-global-radius); - color: var(--ifm-color-white); - opacity: 0; - user-select: none; - padding: 0.4rem 0.5rem; - position: absolute; - right: calc(var(--ifm-pre-padding) / 2); - top: calc(var(--ifm-pre-padding) / 2); - transition: opacity 200ms ease-in-out; -} - -.copyButton:focus, -.codeBlockContent:hover > .copyButton, -.codeBlockTitle:hover + .codeBlockContent .copyButton { - opacity: 1; } .codeBlockLines { @@ -69,8 +44,43 @@ padding: var(--ifm-pre-padding); } +.codeBlockLinesWithNumbering { + display: table; + padding: var(--ifm-pre-padding) 0; +} + @media print { .codeBlockLines { white-space: pre-wrap; } } + +.buttonGroup { + display: flex; + column-gap: 0.2rem; + position: absolute; + right: calc(var(--ifm-pre-padding) / 2); + top: calc(var(--ifm-pre-padding) / 2); +} + +.buttonGroup button { + display: flex; + align-items: center; + background: var(--prism-background-color); + color: var(--prism-color); + border: 1px solid var(--ifm-color-emphasis-300); + 
border-radius: var(--ifm-global-radius); + padding: 0.4rem; + line-height: 0; + transition: opacity 200ms ease-in-out; + opacity: 0; +} + +.buttonGroup button:focus-visible, +.buttonGroup button:hover { + opacity: 1 !important; +} + +:global(.theme-code-block:hover) .buttonGroup button { + opacity: 0.4; +} diff --git a/website/src/theme/CodeBlock/index.js b/website/src/theme/CodeBlock/index.js deleted file mode 100644 index 2a9a9bac488..00000000000 --- a/website/src/theme/CodeBlock/index.js +++ /dev/null @@ -1,183 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ -import React, {isValidElement, useEffect, useState} from 'react'; -import clsx from 'clsx'; -import Highlight, {defaultProps} from 'prism-react-renderer'; -import copy from 'copy-text-to-clipboard'; -import Translate, {translate} from '@docusaurus/Translate'; -import { - useThemeConfig, - parseCodeBlockTitle, - parseLanguage, - parseLines, - ThemeClassNames, - usePrismTheme, -} from '@docusaurus/theme-common'; -import styles from './styles.module.css'; - /* D.B. The only code change from the base swizzled component. This - * adds a squashLinks function that condenses markdown links in the - * generated yml tokens into a single Link token - * - * See the usage of squashLinks below - */ - import squashLinks from './inline-link'; - -/* pushing comment commit to trigger rebuild, will delete */ - -export default function CodeBlock({ - children, - className: blockClassName = '', - metastring, - title, - language: languageProp, -}) { - const {prism} = useThemeConfig(); - const [showCopied, setShowCopied] = useState(false); - const [mounted, setMounted] = useState(false); // The Prism theme on SSR is always the default theme but the site theme - // can be in a different mode. React hydration doesn't update DOM styles - // that come from SSR. 
Hence force a re-render after mounting to apply the - // current relevant styles. There will be a flash seen of the original - // styles seen using this current approach but that's probably ok. Fixing - // the flash will require changing the theming approach and is not worth it - // at this point. - - - useEffect(() => { - setMounted(true); - }, []); // We still parse the metastring in case we want to support more syntax in the - // future. Note that MDX doesn't strip quotes when parsing metastring: - // "title=\"xyz\"" => title: "\"xyz\"" - - const codeBlockTitle = parseCodeBlockTitle(metastring) || title; - const prismTheme = usePrismTheme(); //
           tags in markdown map to CodeBlocks and they may contain JSX children.
          -  // When the children is not a simple string, we just return a styled block
          -  // without actually highlighting.
          -
          -  if (React.Children.toArray(children).some((el) => isValidElement(el))) {
          -    return (
          -      
          -        {({className, style}) => (
          -          
          -            {children}
          -          
          - )} -
          - ); - } // The children is now guaranteed to be one/more plain strings - - const content = Array.isArray(children) ? children.join('') : children; - const language = - languageProp ?? parseLanguage(blockClassName) ?? prism.defaultLanguage; - const {highlightLines, code} = parseLines(content, metastring, language); - - const handleCopyCode = () => { - copy(code); - setShowCopied(true); - setTimeout(() => setShowCopied(false), 2000); - }; - - return ( - - {({className, style, tokens, getLineProps, getTokenProps}) => ( -
          - {codeBlockTitle && ( -
          - {codeBlockTitle} -
          - )} -
          -
          -              
          -                {tokens.map((line, i) => {
          -                  if (line.length === 1 && line[0].content === '\n') {
          -                    line[0].content = '';
          -                  }
          -
          -                  const lineProps = getLineProps({
          -                    line,
          -                    key: i,
          -                  });
          -
          -                  if (highlightLines.includes(i)) {
          -                    lineProps.className += ' docusaurus-highlight-code-line';
          -                  }
          -
          -                  return (
          -                    
          -                       {squashLinks(line).map((token, key) => (
          -                         
          -                       ))}
          -                       
          -
          - ); - })} -
          -
          - - -
          -
          - )} -
          - ); -} diff --git a/website/src/theme/DocCard/index.js b/website/src/theme/DocCard/index.js index 3eecf1f9007..015dcca746c 100644 --- a/website/src/theme/DocCard/index.js +++ b/website/src/theme/DocCard/index.js @@ -1,50 +1,57 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ import React from 'react'; -import Link from '@docusaurus/Link'; -import {findFirstCategoryLink, useDocById} from '@docusaurus/theme-common'; import clsx from 'clsx'; -import styles from './styles.module.css'; +import Link from '@docusaurus/Link'; +import { + findFirstCategoryLink, + useDocById, +} from '@docusaurus/theme-common/internal'; import isInternalUrl from '@docusaurus/isInternalUrl'; import {translate} from '@docusaurus/Translate'; +import styles from './styles.module.css'; + +/* dbt Customizations: + * Add styles.glossaryCard to CardContainer + * Add hoverSnippet prop to CardLayout + * Prevent truncate if card links to /terms/ page + * Show hoverSnippet text instead of description if set + * Get hoverSnippet from frontmatter and pass to CardLayout +*/ function CardContainer({href, children}) { - const className = clsx( - 'card margin-bottom--lg padding--lg', - styles.cardContainer, - href && styles.cardContainerLink, - href.includes('/terms/') && styles.glossaryCard - ); - return href ? ( - + return ( + {children} - ) : ( -
          {children}
          ); } - function CardLayout({href, icon, title, description, hoverSnippet}) { return ( - +

          {icon} {title}

          -
          - {hoverSnippet ? hoverSnippet : description} -
          + {description && ( +

          + {hoverSnippet ? hoverSnippet : description} +

          + )}
          ); } - function CardCategory({item}) { const href = findFirstCategoryLink(item); + // Unexpected: categories that don't have a link have been filtered upfront + if (!href) { + return null; + } return ( ); } - function CardLink({item}) { const icon = isInternalUrl(item.href) ? '📄️' : '🔗'; const doc = useDocById(item.docId ?? undefined); @@ -77,7 +81,7 @@ function CardLink({item}) { hoverSnippet = file.frontMatter.hoverSnippet } } - + return ( ); } - export default function DocCard({item}) { switch (item.type) { case 'link': return ; - case 'category': return ; - default: throw new Error(`unknown item type ${JSON.stringify(item)}`); } -} +} \ No newline at end of file diff --git a/website/src/theme/DocCard/styles.module.css b/website/src/theme/DocCard/styles.module.css index 97a1b846f48..3b691fb0b42 100644 --- a/website/src/theme/DocCard/styles.module.css +++ b/website/src/theme/DocCard/styles.module.css @@ -1,45 +1,29 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - .cardContainer { - height: 8rem; - color: var(--ifm-color-emphasis-800); --ifm-link-color: var(--ifm-color-emphasis-800); - --ifm-link-hover-color: var(--ifm-color-emphasis-800); + --ifm-link-hover-color: var(--ifm-color-emphasis-700); --ifm-link-hover-decoration: none; - /* box-shadow: var(--ifm-global-shadow-lw); */ box-shadow: 0 1.5px 3px 0 rgb(0 0 0 / 15%); border: 1px solid var(--ifm-color-emphasis-200); - transition: box-shadow var(--ifm-transition-fast) ease, - background-color var(--ifm-transition-fast) ease; -} - -.cardContainer.cardContainerLink:hover { - /* box-shadow: var(--ifm-global-shadow-md); */ - box-shadow: 0 4px 8px 0 rgb(0 0 0 / 20%); + transition: all var(--ifm-transition-fast) ease; + transition-property: border, box-shadow; } -[data-theme='dark'] .cardContainer.cardContainerLink:hover { - --ifm-card-background-color: #2d2d2d; /* original, non-hovered color is #242526 */ +.cardContainer:hover { + border-color: var(--ifm-color-primary); + box-shadow: 0 3px 6px 0 rgb(0 0 0 / 20%); } -.cardContainer:not(.cardContainerLink) { - cursor: not-allowed; +.cardContainer *:last-child { + margin-bottom: 0; } .cardTitle { font-size: 1.2rem; - min-height: 1.2rem; } .cardDescription { font-size: 0.8rem; - min-height: 0.8rem; } .glossaryCard { diff --git a/website/src/theme/DocItem/Content/index.js b/website/src/theme/DocItem/Content/index.js new file mode 100644 index 00000000000..faca59ee2f3 --- /dev/null +++ b/website/src/theme/DocItem/Content/index.js @@ -0,0 +1,74 @@ +import React from "react"; +import clsx from "clsx"; +import { ThemeClassNames } from "@docusaurus/theme-common"; +import { useDoc } from "@docusaurus/theme-common/internal"; +import Heading from "@theme/Heading"; +import MDXContent from "@theme/MDXContent"; +/** + Title can be declared inside md content or declared through + front matter and added manually. 
To make both cases consistent, + the added title is added under the same div.markdown block + See https://github.com/facebook/docusaurus/pull/4882#issuecomment-853021120 + + We render a "synthetic title" if: + - user doesn't ask to hide it with front matter + - the markdown content does not already contain a top-level h1 heading +*/ + +/* dbt Customizations: + * Import custom CommunitySpotlightCard component + * Get metadata from useDoc within DocItemContent component + * Check if spotlight member page + * If so, use component rather than header + */ +import CommunitySpotlightCard from "@site/src/components/communitySpotlightCard"; +import QuickstartTOC from "@site/src/components/quickstartTOC"; +import styles from "./styles.module.css"; + +function useSyntheticTitle() { + const { metadata, frontMatter, contentTitle } = useDoc(); + const shouldRender = + !frontMatter.hide_title && typeof contentTitle === "undefined"; + if (!shouldRender) { + return null; + } + return metadata.title; +} +export default function DocItemContent({ children }) { + const syntheticTitle = useSyntheticTitle(); + + // dbt Custom + const { metadata, frontMatter } = useDoc(); + const isSpotlightMember = metadata?.id?.includes("community/spotlight/"); + const isQuickstartGuide = metadata?.id?.startsWith("quickstarts/"); + + return ( +
          + {syntheticTitle && !isSpotlightMember && ( +
          + {syntheticTitle} +
          + )} + + {/* Wrap with small container if spotlight member page */} + {isSpotlightMember ? ( +
          + + {children} +
          + ) : isQuickstartGuide ? ( +
          + +
          + {children} +
          +
          + ) : ( + {children} + )} +
          + ); +} diff --git a/website/src/theme/DocItem/Content/styles.module.css b/website/src/theme/DocItem/Content/styles.module.css new file mode 100644 index 00000000000..72d0fd0f9ad --- /dev/null +++ b/website/src/theme/DocItem/Content/styles.module.css @@ -0,0 +1,4 @@ +.spotlightMemberContain { + max-width: 800px; + margin: 0 auto; +} diff --git a/website/src/theme/DocItem/Layout/index.js b/website/src/theme/DocItem/Layout/index.js new file mode 100644 index 00000000000..8cc89af027b --- /dev/null +++ b/website/src/theme/DocItem/Layout/index.js @@ -0,0 +1,182 @@ +import React, { useState, useEffect, useContext } from 'react'; +import clsx from 'clsx'; +import {useWindowSize} from '@docusaurus/theme-common'; +import {useDoc} from '@docusaurus/theme-common/internal'; +import DocItemPaginator from '@theme/DocItem/Paginator'; +import DocVersionBanner from '@theme/DocVersionBanner'; +import DocVersionBadge from '@theme/DocVersionBadge'; +import DocItemFooter from '@theme/DocItem/Footer'; +import DocItemContent from '@theme/DocItem/Content'; +import DocBreadcrumbs from '@theme/DocBreadcrumbs'; +import styles from './styles.module.css'; + +/* dbt Customizations: + * Import TOC & TOCCollapsible components, ThemeClassNames, VersionContext & getElements util + * Get metadata from useDoc() + * Replace DocItemTOCDesktop with TOC component + * to avoid swizzling DocItemTOCDesktop component. 
+ * Pass custom featured_cta and editUrl props to TOC + * Get headers and rebuild ToC to hide headers not available in current version + * Show ToC if tocReady = true + * Add tocLoader styles +*/ +import DocSearchWeight from '@site/src/components/docSearchWeight'; +import TOC from '@theme/TOC'; +import TOCCollapsible from '@theme/TOCCollapsible'; +import {ThemeClassNames} from '@docusaurus/theme-common'; +import VersionContext from '../../../stores/VersionContext' +import getElements from '../../../utils/get-html-elements'; +import useHashLink from '../../../utils/use-hash-link'; + +/** + * Decide if the toc should be rendered, on mobile or desktop viewports + */ +function useDocTOC() { + const {frontMatter, toc, metadata} = useDoc(); + + // dbt Custom: If term has cta property set, show that cta + const termCTA = frontMatter?.cta && frontMatter.cta + + // This hides any TOC items not in + // html markdown headings for current version. + const { version: dbtVersion } = useContext(VersionContext) + const [currentToc, setCurrentToc] = useState(toc) + const [tocReady, setTocReady] = useState(true) + + async function fetchElements() { + // get html elements + const headings = await getElements(".markdown h1, .markdown h2, .markdown h3, .markdown h4, .markdown h5, .markdown h6") + + // if headings exist on page + // compare against toc + if(toc && headings && headings.length) { + // make new TOC object + let updated = Array.from(headings).reduce((acc, item) => { + // If heading id and toc item id match found + // include in updated toc + let found = toc.find(heading => + heading.id.includes(item.id) + ) + // If toc item is not in headings + // do not include in toc + // This means heading is versioned + + let makeToc = (heading) => { + let level; + if (heading.nodeName === "H2") { + level = 2 + } else if (heading.nodeName === "H3") { + level = 3 + } else { + level = null + } + + return { + value: heading.innerHTML, + id: heading.id, + level: level && level + } + } + + 
if (found) { + acc.push(makeToc(item)) + } else if (!found) { + acc.push(makeToc(item)) + } else { + null + } + + return acc + }, []) + + // If updated toc different than current + // If so, show loader and update toc + if(currentToc !== updated) { + setTocReady(false) + // This timeout provides enough time to show the loader + // Otherwise the content updates immediately + // and toc content appears to flash with updates + setTimeout(() => { + setCurrentToc(updated) + setTocReady(true) + }, 500) + } else { + setTocReady(true) + } + } else { + setTocReady(true) + } + useHashLink() + } + + useEffect(() => { + fetchElements() + }, [toc, dbtVersion]) + + // end dbt Custom + + const windowSize = useWindowSize(); + const hidden = frontMatter.hide_table_of_contents; + const canRender = !hidden && toc.length > 0; + const mobile = canRender ? : undefined; + const desktop = + canRender && (windowSize === 'desktop' || windowSize === 'ssr') ? ( + <> + {tocReady ? ( + + ) : ( + Loading + )} + + ) : undefined; + return { + hidden, + mobile, + desktop, + }; +} +export default function DocItemLayout({children}) { + const docTOC = useDocTOC(); + + // dbt Custom + // If the page has a search_weight value, apply that value + const {frontMatter} = useDoc(); + const searchWeight = frontMatter?.search_weight && frontMatter.search_weight + + return ( +
          +
          + +
          +
          + + + {docTOC.mobile} + {children} + + +
          + +
          +
          + {docTOC.desktop &&
          {docTOC.desktop}
          } +
          + ); +} diff --git a/website/src/theme/DocItem/styles.module.css b/website/src/theme/DocItem/Layout/styles.module.css similarity index 65% rename from website/src/theme/DocItem/styles.module.css rename to website/src/theme/DocItem/Layout/styles.module.css index 27b8b4b1819..fb5d5fff254 100644 --- a/website/src/theme/DocItem/styles.module.css +++ b/website/src/theme/DocItem/Layout/styles.module.css @@ -1,10 +1,3 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - .docItemContainer header + *, .docItemContainer article > *:first-child { margin-top: 0; @@ -14,13 +7,21 @@ .docItemCol { max-width: 75% !important; } +} +@media (min-width: 997px) { /* Prevent hydration FOUC, as the mobile TOC needs to be server-rendered */ .tocMobile { display: none; } } +@media print { + .tocMobile { + display: none; + } +} + .tocLoader { max-width: 100px; display: block; diff --git a/website/src/theme/DocItem/index.js b/website/src/theme/DocItem/index.js deleted file mode 100644 index 5337c58b808..00000000000 --- a/website/src/theme/DocItem/index.js +++ /dev/null @@ -1,213 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ -import React, { useState, useEffect, useContext } from 'react'; -import clsx from 'clsx'; -import DocPaginator from '@theme/DocPaginator'; -import DocVersionBanner from '@theme/DocVersionBanner'; -import DocVersionBadge from '@theme/DocVersionBadge'; -import Seo from '@theme/Seo'; -import DocItemFooter from '@theme/DocItemFooter'; -import TOC from '@theme/TOC'; -import TOCCollapsible from '@theme/TOCCollapsible'; -import Heading from '@theme/Heading'; -import styles from './styles.module.css'; -import { ThemeClassNames, useWindowSize } from '@docusaurus/theme-common'; -import DocBreadcrumbs from '@theme/DocBreadcrumbs'; -import DocSearchWeight from '@site/src/components/docSearchWeight'; - -// dbt Custom -import VersionContext from '../../stores/VersionContext' -import getElements from '../../utils/get-html-elements'; - -export default function DocItem(props) { - const { content: DocContent } = props; - const { metadata, frontMatter, assets } = DocContent; - const { - keywords, - hide_title: hideTitle, - hide_table_of_contents: hideTableOfContents, - toc_min_heading_level: tocMinHeadingLevel, - toc_max_heading_level: tocMaxHeadingLevel, - } = frontMatter; - const { description, title } = metadata; - const image = assets.image ?? 
frontMatter.image; // We only add a title if: - // - user asks to hide it with front matter - // - the markdown content does not already contain a top-level h1 heading - - const shouldAddTitle = - !hideTitle && typeof DocContent.contentTitle === 'undefined'; - const windowSize = useWindowSize(); - const canRenderTOC = - !hideTableOfContents && DocContent.toc && DocContent.toc.length > 0; - const renderTocDesktop = - canRenderTOC && (windowSize === 'desktop' || windowSize === 'ssr'); - - // dbt Custom - // If term has cta property set, show that cta - const termCTA = frontMatter?.cta && frontMatter.cta - - // dbt Custom - // If the page has a search_weight value, apply that value - const searchWeight = frontMatter?.search_weight && frontMatter.search_weight - - // This hides any TOC items not in - // html markdown headings for current version. - const { version: dbtVersion } = useContext(VersionContext) - const [currentToc, setCurrentToc] = useState(DocContent.toc) - const [tocReady, setTocReady] = useState(true) - useEffect(() => { - async function fetchElements() { - // get html elements - const headings = await getElements(".markdown h1, .markdown h2, .markdown h3, .markdown h4, .markdown h5, .markdown h6") - // if headings exist on page - // compare against toc - if (DocContent.toc && headings && headings.length) { - // make new TOC object - let updated = Array.from(headings).reduce((acc, item) => { - // If heading id and toc item id match found - // include in updated toc - let found = DocContent.toc.find(heading => - heading.id.includes(item.id) - ) - // If toc item is not in headings - // do not include in toc - // This means heading is versioned - - let makeToc = (heading) => { - let level; - if (heading.nodeName === "H2") { - level = 2 - } else if (heading.nodeName === "H3") { - level = 3 - } else { - level = null - } - - return { - value: heading.innerHTML, - id: heading.id, - level: level && level - } - } - - if (found) { - acc.push(makeToc(item)) - } 
else if (!found) { - acc.push(makeToc(item)) - } else { - null - } - - return acc - }, []) - - // If updated toc different than current - // If so, show loader and update toc - if (currentToc.length !== updated.length) { - setTocReady(false) - // This timeout provides enough time to show the loader - // Otherwise the content updates immediately - // and toc content appears to flash with updates - setTimeout(() => { - setCurrentToc(updated) - setTocReady(true) - }, 500) - } else { - setTocReady(true) - } - } else { - setTocReady(true) - } - } - fetchElements() - }, [DocContent, dbtVersion]) - // end dbt Custom - - return ( - <> - - -
          -
          - -
          -
          - - - - {canRenderTOC && ( - - )} - -
          - {/* - Title can be declared inside md content or declared through - front matter and added manually. To make both cases consistent, - the added title is added under the same div.markdown block - See https://github.com/facebook/docusaurus/pull/4882#issuecomment-853021120 - */} - {shouldAddTitle && ( -
          - {title} -
          - )} - - - - -
          - - -
          - - -
          -
          - {renderTocDesktop && ( -
          - {tocReady ? ( - - ) : ( - Loading - )} -
          - )} -
          - - ); -} diff --git a/website/src/theme/DocPage/Layout/Main/index.js b/website/src/theme/DocPage/Layout/Main/index.js new file mode 100644 index 00000000000..d8f6bc76be8 --- /dev/null +++ b/website/src/theme/DocPage/Layout/Main/index.js @@ -0,0 +1,123 @@ +import React, { useState, useEffect, useContext } from 'react'; +import clsx from 'clsx'; +import {useDocsSidebar, useLocalPathname} from '@docusaurus/theme-common/internal'; +import styles from './styles.module.css'; + +/* dbt Customizations: + * Import Admonition for version banners, and version-related plugin, context, method + * Get page path with useLocalPathname hook + * Check if page available for current version + * Check whether this version is a isPrerelease + * Check End of Life date and show unsupported banner if deprecated version + * useEffect to show banner content + * Show Admonition banners if needed +*/ +import Admonition from '@theme/Admonition'; +import {usePluginData} from '@docusaurus/useGlobalData'; +import VersionContext from '../../../../stores/VersionContext' +import pageVersionCheck from '../../../../utils/page-version-check'; + +export default function DocPageLayoutMain({hiddenSidebarContainer, children}) { + const sidebar = useDocsSidebar(); + + // Get current page path + const currentDocRoute = useLocalPathname() + + // Check if page available for current version + + const { versionedPages } = usePluginData('docusaurus-build-global-data-plugin'); + const { version: dbtVersion, EOLDate, isPrerelease, latestStableRelease } = useContext(VersionContext) + const { pageAvailable, firstAvailableVersion } = pageVersionCheck(dbtVersion, versionedPages, currentDocRoute) + + // Check whether this version is a isPrerelease, and show banner if so + const [PreData, setPreData] = useState({ + showisPrereleaseBanner: false, + isPrereleaseBannerText: '' + }) + + // Check End of Life date and show unsupported banner if deprecated version + const [EOLData, setEOLData] = useState({ + 
showEOLBanner: false, + EOLBannerText: '' + }) + + useEffect(() => { + // If version is not isPrerelease, do not show banner + if(!isPrerelease) { + setPreData({ + showisPrereleaseBanner: false, + isPrereleaseBannerText: '' + }) + } else { + setPreData({ + showisPrereleaseBanner: true, + isPrereleaseBannerText : `You are currently viewing v${dbtVersion}, which is a prerelease of dbt Core. The latest stable version is v${latestStableRelease}` + }) + } + // If EOLDate not set for version, do not show banner + if(!EOLDate) { + setEOLData({ + showEOLBanner: false, + EOLBannerText: '' + }) + } else { + let threeMonths = new Date(EOLDate) + threeMonths.setMonth(threeMonths.getMonth() - 3) + if(new Date() > new Date(EOLDate)) { + setEOLData({ + showEOLBanner: true, + EOLBannerText: `This version of dbt Core is no longer supported. No patch releases will be made, even for critical security issues. For better performance, improved security, and new features, you should upgrade to ${latestStableRelease}, the latest stable version.` + }) + } else if(new Date() > threeMonths) { + setEOLData({ + showEOLBanner: true, + EOLBannerText: `This version of dbt Core is nearing the end of its critical support period. For better performance, improved security, and new features, you should upgrade to ${latestStableRelease}, the latest stable version.` + }) + } else { + setEOLData({ + showEOLBanner: false, + EOLBannerText: '' + }) + } + } + }, [dbtVersion]) + + return ( +
          +
          + {!pageAvailable && dbtVersion && firstAvailableVersion && ( +
          + +

          Unfortunately, this feature is not available in dbt Core version {dbtVersion}

          +

          You should upgrade to {firstAvailableVersion} or later if you want to use this feature.

          +
          +
          + )} + {PreData.showisPrereleaseBanner && ( +
          + +
          + +
          + )} + {EOLData.showEOLBanner && ( +
          + +
          + +
          + )} + {children} +
          +
          + ); +} diff --git a/website/src/theme/DocPage/Layout/Main/styles.module.css b/website/src/theme/DocPage/Layout/Main/styles.module.css new file mode 100644 index 00000000000..3598915de8f --- /dev/null +++ b/website/src/theme/DocPage/Layout/Main/styles.module.css @@ -0,0 +1,26 @@ +.docMainContainer { + display: flex; + width: 100%; +} + +@media (min-width: 997px) { + .docMainContainer { + flex-grow: 1; + max-width: calc(100% - var(--doc-sidebar-width)); + } + + .docMainContainerEnhanced { + max-width: calc(100% - var(--doc-sidebar-hidden-width)); + } + + .docItemWrapperEnhanced { + max-width: calc( + var(--ifm-container-width) + var(--doc-sidebar-width) + ) !important; + } +} + +/* Page Not Available Banner */ +:local(.versionBanner) h5 { + margin-bottom: 0; +} diff --git a/website/src/theme/DocPage/index.js b/website/src/theme/DocPage/index.js deleted file mode 100644 index 770ba36e6e2..00000000000 --- a/website/src/theme/DocPage/index.js +++ /dev/null @@ -1,255 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ -import React, {useState, useCallback, useContext, useEffect} from 'react'; -import {MDXProvider} from '@mdx-js/react'; -import renderRoutes from '@docusaurus/renderRoutes'; -import Layout from '@theme/Layout'; -import DocSidebar from '@theme/DocSidebar'; -import MDXComponents from '@theme/MDXComponents'; -import NotFound from '@theme/NotFound'; -import IconArrow from '@theme/IconArrow'; -import BackToTopButton from '@theme/BackToTopButton'; -import {matchPath} from '@docusaurus/router'; -import {translate} from '@docusaurus/Translate'; -import clsx from 'clsx'; -import styles from './styles.module.css'; -import { - ThemeClassNames, - docVersionSearchTag, - DocsSidebarProvider, - useDocsSidebar, - DocsVersionProvider, -} from '@docusaurus/theme-common'; -import Head from '@docusaurus/Head'; -import Admonition from '@theme/Admonition'; -import {usePluginData} from '@docusaurus/useGlobalData'; -import VersionContext from '../../stores/VersionContext' -import pageVersionCheck from '../../utils/page-version-check'; - -function DocPageContent({ - currentDocRoute, - versionMetadata, - children, - sidebarName, -}) { - const sidebar = useDocsSidebar(); - const {pluginId, version} = versionMetadata; - const [hiddenSidebarContainer, setHiddenSidebarContainer] = useState(false); - const [hiddenSidebar, setHiddenSidebar] = useState(false); - const toggleSidebar = useCallback(() => { - if (hiddenSidebar) { - setHiddenSidebar(false); - } - - setHiddenSidebarContainer((value) => !value); - }, [hiddenSidebar]); - - // Check if page available for current version - const { versionedPages } = usePluginData('docusaurus-build-global-data-plugin'); - const { version: dbtVersion, EOLDate, isPrerelease, latestStableRelease } = useContext(VersionContext) - const { pageAvailable, firstAvailableVersion } = pageVersionCheck(dbtVersion, versionedPages, currentDocRoute.path) - - // Check whether this version is a isPrerelease, and show banner if so - const [PreData, setPreData] = 
useState({ - showisPrereleaseBanner: false, - isPrereleaseBannerText: '' - }) - - // Check End of Life date and show unsupported banner if deprecated version - const [EOLData, setEOLData] = useState({ - showEOLBanner: false, - EOLBannerText: '' - }) - - useEffect(() => { - // If version is not isPrerelease, do not show banner - if(!isPrerelease) { - setPreData({ - showisPrereleaseBanner: false, - isPrereleaseBannerText: '' - }) - } else { - setPreData({ - showisPrereleaseBanner: true, - isPrereleaseBannerText : `You are currently viewing v${dbtVersion}, which is a prerelease of dbt Core. The latest stable version is v${latestStableRelease}` - }) - } - // If EOLDate not set for version, do not show banner - if(!EOLDate) { - setEOLData({ - showEOLBanner: false, - EOLBannerText: '' - }) - } else { - let threeMonths = new Date(EOLDate) - threeMonths.setMonth(threeMonths.getMonth() - 3) - if(new Date() > new Date(EOLDate)) { - setEOLData({ - showEOLBanner: true, - EOLBannerText: `This version of dbt Core is no longer supported. No patch releases will be made, even for critical security issues. For better performance, improved security, and new features, you should upgrade to ${latestStableRelease}, the latest stable version.` - }) - } else if(new Date() > threeMonths) { - setEOLData({ - showEOLBanner: true, - EOLBannerText: `This version of dbt Core is nearing the end of its critical support period. For better performance, improved security, and new features, you should upgrade to ${latestStableRelease}, the latest stable version.` - }) - } else { - setEOLData({ - showEOLBanner: false, - EOLBannerText: '' - }) - } - } - }, [dbtVersion]) - - return ( - -
          - - - {sidebar && ( - - )} -
          -
          - {!pageAvailable && dbtVersion && firstAvailableVersion && ( -
          - -

          Unfortunately, this feature is not available in dbt Core version {dbtVersion}

          -

          You should upgrade to {firstAvailableVersion} or later if you want to use this feature.

          -
          -
          - )} - {PreData.showisPrereleaseBanner && ( -
          - -
          - -
          - )} - {EOLData.showEOLBanner && ( -
          - -
          - -
          - )} - {children} -
          -
          -
          -
          - ); -} - -function DocPage(props) { - const { - route: {routes: docRoutes}, - versionMetadata, - location, - } = props; - const currentDocRoute = docRoutes.find((docRoute) => - matchPath(location.pathname, docRoute), - ); - - if (!currentDocRoute) { - return ; - } // For now, the sidebarName is added as route config: not ideal! - - const sidebarName = currentDocRoute.sidebar; - const sidebar = sidebarName - ? versionMetadata.docsSidebars[sidebarName] - : null; - return ( - <> - - {/* TODO we should add a core addRoute({htmlClassName}) generic plugin option */} - - - - - - {renderRoutes(docRoutes, { - versionMetadata, - })} - - - - - ); -} - -export default DocPage; diff --git a/website/src/theme/DocPage/styles.module.css b/website/src/theme/DocPage/styles.module.css deleted file mode 100644 index 0c1e3b60e59..00000000000 --- a/website/src/theme/DocPage/styles.module.css +++ /dev/null @@ -1,90 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -:root { - --doc-sidebar-width: 300px; - --doc-sidebar-hidden-width: 30px; -} - -:global(.docs-wrapper) { - display: flex; -} - -.docPage, -.docMainContainer { - display: flex; - width: 100%; -} - -.docSidebarContainer { - display: none; -} - -@media (min-width: 997px) { - .docMainContainer { - flex-grow: 1; - max-width: calc(100% - var(--doc-sidebar-width)); - } - - .docMainContainerEnhanced { - max-width: calc(100% - var(--doc-sidebar-hidden-width)); - } - - .docSidebarContainer { - display: block; - width: var(--doc-sidebar-width); - margin-top: calc(-1 * var(--ifm-navbar-height)); - border-right: 1px solid var(--ifm-toc-border-color); - will-change: width; - transition: width var(--ifm-transition-fast) ease; - clip-path: inset(0); - } - - .docSidebarContainerHidden { - width: var(--doc-sidebar-hidden-width); - cursor: pointer; - } - - .collapsedDocSidebar { - position: sticky; - top: 0; - height: 100%; - max-height: 100vh; - display: flex; - align-items: center; - justify-content: center; - transition: background-color var(--ifm-transition-fast) ease; - } - - .collapsedDocSidebar:hover, - .collapsedDocSidebar:focus { - background-color: var(--ifm-color-emphasis-200); - } - - .expandSidebarButtonIcon { - transform: rotate(0); - } - html[dir='rtl'] .expandSidebarButtonIcon { - transform: rotate(180deg); - } - - html[data-theme='dark'] .collapsedDocSidebar:hover, - html[data-theme='dark'] .collapsedDocSidebar:focus { - background-color: var(--collapse-button-bg-color-dark); - } - - .docItemWrapperEnhanced { - max-width: calc( - var(--ifm-container-width) + var(--doc-sidebar-width) - ) !important; - } -} - -/* Page Not Available Banner */ -:local(.versionBanner) h5 { - margin-bottom: 0; -} diff --git a/website/src/theme/DocSidebarItem/index.js b/website/src/theme/DocSidebarItem/index.js index 6aa73966027..6b60432a611 100644 --- a/website/src/theme/DocSidebarItem/index.js +++ b/website/src/theme/DocSidebarItem/index.js @@ -1,234 +1,46 @@ -/** - * Copyright 
(c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ -import React, {useEffect, memo, useMemo, useContext} from 'react'; -import clsx from 'clsx'; -import { - isActiveSidebarItem, - usePrevious, - Collapsible, - useCollapsible, - findFirstCategoryLink, - ThemeClassNames, -} from '@docusaurus/theme-common'; -import Link from '@docusaurus/Link'; -import isInternalUrl from '@docusaurus/isInternalUrl'; -import {translate} from '@docusaurus/Translate'; -import IconExternalLink from '@theme/IconExternalLink'; -import styles from './styles.module.css'; -import useIsBrowser from '@docusaurus/useIsBrowser'; // Optimize sidebar at each "level" -// TODO this item should probably not receive the "activePath" props -// TODO this triggers whole sidebar re-renders on navigation - +import React, { useContext } from 'react'; +import DocSidebarItemCategory from '@theme/DocSidebarItem/Category'; +import DocSidebarItemLink from '@theme/DocSidebarItem/Link'; +import DocSidebarItemHtml from '@theme/DocSidebarItem/Html'; + +/* dbt Customizations: + * Import version context and utils + * Get versionedPages from website/dbt-versions.js + * Get version from context + * Hide sidebar item if not available for current version +*/ import {usePluginData} from '@docusaurus/useGlobalData'; import VersionContext from '../../stores/VersionContext' import pageVersionCheck from '../../utils/page-version-check'; +import categoryVersionCheck from '../../utils/category-version-check'; -export const DocSidebarItems = memo(({items, ...props}) => { - const { versionedPages } = usePluginData('docusaurus-build-global-data-plugin'); - return ( - <> - {items.map((item, index) => ( - - ))} - - ); -}) +export default function DocSidebarItem({item, ...props}) { -export default function DocSidebarItem({item, versionedPages, ...props}) { + // dbt Custom + const { versionedPages, 
versionedCategories } = usePluginData('docusaurus-build-global-data-plugin'); const { version } = useContext(VersionContext) - + + // Hide versionedPages if they do not match the current version if(version && versionedPages) { const { pageAvailable } = pageVersionCheck(version, versionedPages, item.docId) if(!pageAvailable) return null } + // Hide versionedCategories if they do not match the current version + if(version && versionedCategories && item.type === 'category') { + const { categoryAvailable } = categoryVersionCheck(version, versionedCategories, item.label) + if(!categoryAvailable) + return null + } + switch (item.type) { case 'category': - if (item.items.length === 0) { - return null; - } - return ; - + case 'html': + return ; case 'link': default: return ; } -} // If we navigate to a category and it becomes active, it should automatically expand itself - -function useAutoExpandActiveCategory({isActive, collapsed, setCollapsed}) { - const wasActive = usePrevious(isActive); - useEffect(() => { - const justBecameActive = isActive && !wasActive; - - if (justBecameActive && collapsed) { - setCollapsed(false); - } - }, [isActive, wasActive, collapsed, setCollapsed]); -} // When a collapsible category has no link, we still link it to its first child during SSR as a temporary fallback -// This allows to be able to navigate inside the category even when JS fails to load, is delayed or simply disabled -// React hydration becomes an optional progressive enhancement -// see https://github.com/facebookincubator/infima/issues/36#issuecomment-772543188 -// see https://github.com/facebook/docusaurus/issues/3030 - -function useCategoryHrefWithSSRFallback(item) { - const isBrowser = useIsBrowser(); - return useMemo(() => { - if (item.href) { - return item.href; - } // In these cases, it's not necessary to render a fallback - // We skip the "findFirstCategoryLink" computation - - if (isBrowser || !item.collapsible) { - return undefined; - } - - return 
findFirstCategoryLink(item); - }, [item, isBrowser]); -} - -function DocSidebarItemCategory({ - item, - onItemClick, - activePath, - level, - ...props -}) { - const {items, label, collapsible, className, href} = item; - const hrefWithSSRFallback = useCategoryHrefWithSSRFallback(item); - const isActive = isActiveSidebarItem(item, activePath); - const {collapsed, setCollapsed, toggleCollapsed} = useCollapsible({ - // active categories are always initialized as expanded - // the default (item.collapsed) is only used for non-active categories - initialState: () => { - if (!collapsible) { - return false; - } - - return isActive ? false : item.collapsed; - }, - }); - useAutoExpandActiveCategory({ - isActive, - collapsed, - setCollapsed, - }); - return ( -
        • -
          - { - onItemClick?.(item); - - if (href) { - setCollapsed(false); - } else { - e.preventDefault(); - toggleCollapsed(); - } - } - : () => { - onItemClick?.(item); - } - } - href={collapsible ? hrefWithSSRFallback ?? '#' : hrefWithSSRFallback} - {...props}> - {label} - - {href && collapsible && ( -
          - - - - -
        • - ); -} - -function DocSidebarItemLink({item, onItemClick, activePath, level, ...props}) { - const {href, label, className} = item; - const isActive = isActiveSidebarItem(item, activePath); - return ( -
        • - onItemClick(item) : undefined, - })} - {...props}> - {isInternalUrl(href) ? ( - label - ) : ( - - {label} - - - )} - -
        • - ); } diff --git a/website/src/theme/DocSidebarItem/styles.module.css b/website/src/theme/DocSidebarItem/styles.module.css deleted file mode 100644 index 2fcd42e59fe..00000000000 --- a/website/src/theme/DocSidebarItem/styles.module.css +++ /dev/null @@ -1,19 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - -@media (min-width: 997px) { - .menuLinkText { - cursor: initial; - } - .menuLinkText:hover { - background: none; - } - - .menuLinkText.hasHref { - cursor: pointer; - } -} diff --git a/website/src/theme/MDXComponents/index.js b/website/src/theme/MDXComponents/index.js index c9514a320b3..f21b0b84452 100644 --- a/website/src/theme/MDXComponents/index.js +++ b/website/src/theme/MDXComponents/index.js @@ -1,19 +1,17 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ -import React, { isValidElement } from "react"; -import Head from "@docusaurus/Head"; -import CodeBlock from "@theme/CodeBlock"; -import Heading from "@theme/Heading"; -import Details from "@theme/Details"; -import "./styles.css"; // MDX elements are wrapped through the MDX pragma. In some cases (notably usage -// with Head/Helmet) we need to unwrap those elements. 
+import React from 'react'; +import MDXHead from '@theme/MDXComponents/Head'; +import MDXCode from '@theme/MDXComponents/Code'; +import MDXPre from '@theme/MDXComponents/Pre'; +import MDXDetails from '@theme/MDXComponents/Details'; +import MDXHeading from '@theme/MDXComponents/Heading'; +import MDXUl from '@theme/MDXComponents/Ul'; +import MDXImg from '@theme/MDXComponents/Img'; +import MDXA from '@theme/MDXComponents/A'; +import Admonition from '@theme/Admonition'; +import Mermaid from '@theme/Mermaid'; -/* - * docs.getdbt.com additions: +/* dbt Customizations: + * Imports the following components below for export */ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem' @@ -25,7 +23,6 @@ import FAQ from '@site/src/components/faqs'; import FAQList from '@site/src/components/faqList'; import File from '@site/src/components/file'; import Lightbox from '@site/src/components/lightbox'; -import Link from '@site/src/components/link'; import LoomVideo from '@site/src/components/loom'; import Snippet from '@site/src/components/snippet'; import YoutubeVideo from '@site/src/components/youtube'; @@ -34,76 +31,33 @@ import VersionBlock from '@site/src/components/versionBlock'; import Var from '@site/src/components/variable'; import Term from '@site/src/components/term'; import EventsFeed from '@site/src/components/events'; -import { DiscourseFeed, DiscourseHelpFeed } from '@site/src/components/discourse'; +import { DiscourseFeed, DiscourseHelpFeed } from '@site/src/components/discourse'; import Hero from '@site/src/components/hero' import Card from '@site/src/components/card' import Callout from '@site/src/components/callout' import BlogPostCard from '@site/src/components/blogPostCard'; +import DocCarousel from '@site/src/components/docCarousel'; import PostCarousel from '@site/src/components/postCarousel'; - -function unwrapMDXElement(element) { - if (element?.props?.mdxType && element?.props?.originalType) { - const { mdxType, originalType, ...newProps } = 
element.props; - return React.createElement(element.props.originalType, newProps); - } - - return element; -} +import CommunitySpotlightCard from '@site/src/components/communitySpotlightCard'; +import CommunitySpotlightList from '@site/src/components/communitySpotlightList'; +import dbtEditor from '@site/src/components/dbt-editor'; const MDXComponents = { - head: (props) => { - const unwrappedChildren = React.Children.map(props.children, (child) => - unwrapMDXElement(child) - ); - return {unwrappedChildren}; - }, - code: (props) => { - const inlineElements = [ - "a", - "b", - "big", - "i", - "span", - "em", - "strong", - "sup", - "sub", - "small", - ]; - const shouldBeInline = React.Children.toArray(props.children).every( - (el) => - (typeof el === "string" && !el.includes("\n")) || - (React.isValidElement(el) && inlineElements.includes(el.props.mdxType)) - ); - return shouldBeInline ? : ; - }, - a: (props) => , - pre: (props) => ( - - ), - details: (props) => { - const items = React.Children.toArray(props.children); // Split summary item from the rest to pass it as a separate prop to the - // Details theme component - - const summary = items.find((item) => item?.props?.mdxType === "summary"); - const children = <>{items.filter((item) => item !== summary)}; - return ( -
          - {children} -
          - ); - }, - h1: (props) => , - h2: (props) => , - h3: (props) => , - h4: (props) => , - h5: (props) => , - h6: (props) => , + head: MDXHead, + code: MDXCode, + a: MDXA, + pre: MDXPre, + details: MDXDetails, + ul: MDXUl, + img: MDXImg, + h1: (props) => , + h2: (props) => , + h3: (props) => , + h4: (props) => , + h5: (props) => , + h6: (props) => , + admonition: Admonition, + mermaid: Mermaid, BlogPostCard: BlogPostCard, Callout: Callout, @@ -111,12 +65,12 @@ const MDXComponents = { Changelog: Changelog, CloudCore: CloudCore, Collapsible: Collapsible, + DocCarousel: DocCarousel, FAQ: FAQ, FAQList: FAQList, File: File, Hero: Hero, Lightbox: Lightbox, - Link: Link, LoomVideo: LoomVideo, PostCarousel: PostCarousel, Tabs: Tabs, @@ -131,6 +85,8 @@ const MDXComponents = { EventsFeed: EventsFeed, DiscourseFeed: DiscourseFeed, DiscourseHelpFeed: DiscourseHelpFeed, - Card: Card, + CommunitySpotlightCard, + CommunitySpotlightList, + dbtEditor: dbtEditor, }; export default MDXComponents; diff --git a/website/src/theme/MDXComponents/styles.css b/website/src/theme/MDXComponents/styles.css deleted file mode 100644 index 3c263a23347..00000000000 --- a/website/src/theme/MDXComponents/styles.css +++ /dev/null @@ -1,15 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -ul.contains-task-list { - padding-left: 0; - list-style: none; -} - -img { - height: auto; -} diff --git a/website/src/theme/Navbar/Logo/index.js b/website/src/theme/Navbar/Logo/index.js new file mode 100644 index 00000000000..2e598672cb9 --- /dev/null +++ b/website/src/theme/Navbar/Logo/index.js @@ -0,0 +1,25 @@ +import React from 'react'; +import Logo from '@theme/Logo'; + +/* dbt Customizations: + * Import useActiveDocContext hook to check if docs page + * Check if page has sidebar + * If has sidebar, add custom class to logo + * This sets custom styles with right border for docs pages +*/ +import {useActiveDocContext} from '@docusaurus/plugin-content-docs/client'; + +export default function NavbarLogo() { + let hasSidebar = true + const thisDocContext = useActiveDocContext() + if(!thisDocContext?.activeDoc?.sidebar) { + hasSidebar = false + } + return ( + + ); +} diff --git a/website/src/theme/NavbarItem/DropdownNavbarItem.js b/website/src/theme/NavbarItem/DropdownNavbarItem.js index 9dfef4fcf06..8c29daeac21 100644 --- a/website/src/theme/NavbarItem/DropdownNavbarItem.js +++ b/website/src/theme/NavbarItem/DropdownNavbarItem.js @@ -1,50 +1,53 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ -import React, {useState, useRef, useEffect, useContext} from 'react'; +import React, { useState, useRef, useEffect, useContext } from 'react'; import clsx from 'clsx'; import { - isSamePath, + isRegexpStringMatch, useCollapsible, Collapsible, - isRegexpStringMatch, - useLocalPathname, } from '@docusaurus/theme-common'; +import { isSamePath, useLocalPathname } from '@docusaurus/theme-common/internal'; import NavbarNavLink from '@theme/NavbarItem/NavbarNavLink'; import NavbarItem from '@theme/NavbarItem'; -import VersionsNavbarItem from './VersionsNavItem'; -import VersionContext from '../../stores/VersionContext'; -const dropdownLinkActiveClass = 'dropdown__link--active'; +/* dbt Customizations: + * Import VersionsNavbarItem component and context + * Pass versionContext prop into DropdownNavbarItemDesktop or DropdownNavbarItemMobile + * Custom state to handle version dropdown on click + * Show version dropdown on version state change + * Pass versionContext to Comp + * +*/ +// import VersionsNavbarItem from './VersionsNavItem'; +import VersionContext from '../../stores/VersionContext'; +import { versions } from '../../../dbt-versions' function isItemActive(item, localPathname) { if (isSamePath(item.to, localPathname)) { return true; } - if (isRegexpStringMatch(item.activeBaseRegex, localPathname)) { return true; } - if (item.activeBasePath && localPathname.startsWith(item.activeBasePath)) { return true; } - return false; } - function containsActiveItems(items, localPathname) { return items.some((item) => isItemActive(item, localPathname)); } - -function DropdownNavbarItemDesktop({items, position, className, versionContext, ...props}) { +function DropdownNavbarItemDesktop({ + items, + position, + className, + onClick, // eslint-disable-line + versionContext, + ...props +}) { const dropdownRef = useRef(null); const [showDropdown, setShowDropdown] = useState(false); - // handle version dropdown state on click + // dbt Custom: handle version dropdown state on 
click const [showVersionDropdown, setShowVersionDropdown] = useState(true); useEffect(() => { @@ -52,10 +55,8 @@ function DropdownNavbarItemDesktop({items, position, className, versionContext, if (!dropdownRef.current || dropdownRef.current.contains(event.target)) { return; } - setShowDropdown(false); }; - document.addEventListener('mousedown', handleClickOutside); document.addEventListener('touchstart', handleClickOutside); return () => { @@ -64,7 +65,7 @@ function DropdownNavbarItemDesktop({items, position, className, versionContext, }; }, [dropdownRef]); - // Hide version dropdown on click + // dbt Custom: Hide version dropdown on click // This adds dropdown--version--hide class on line 87 const handleVersionMenuClick = () => { setShowVersionDropdown(false) @@ -87,82 +88,82 @@ function DropdownNavbarItemDesktop({items, position, className, versionContext, 'dropdown--version--hide': !showVersionDropdown, })}> e.preventDefault()} onKeyDown={(e) => { if (e.key === 'Enter') { e.preventDefault(); setShowDropdown(!showDropdown); } - }}> + }} + label={className === "nav-versioning" ? `v${versionContext.version} ${versionContext?.isPrerelease ? "(Beta)" : ""}` : props.children ?? props.label} + > {props.children ?? props.label}
          ); } - function DropdownNavbarItemMobile({ items, className, - position: _position, + position, // eslint-disable-line + onClick, versionContext, - // Need to destructure position from props so that it doesn't get passed on. ...props }) { const localPathname = useLocalPathname(); const containsActive = containsActiveItems(items, localPathname); - const {collapsed, toggleCollapsed, setCollapsed} = useCollapsible({ + const { collapsed, toggleCollapsed, setCollapsed } = useCollapsible({ initialState: () => !containsActive, - }); // Expand/collapse if any item active after a navigation - + }); + // Expand/collapse if any item active after a navigation useEffect(() => { if (containsActive) { setCollapsed(!containsActive); @@ -175,45 +176,46 @@ function DropdownNavbarItemMobile({ })}> { e.preventDefault(); toggleCollapsed(); - }}> + }} + label={className === "nav-versioning" ? `v${versionContext.version} ${versionContext.isPrerelease ? "(Beta)" : ""}` : props.children ?? props.label} + > {props.children ?? props.label} - {items.map((childItemProps, i) => ( - className === "nav-versioning" ? ( - versionContext.updateVersion(e)} - activeClassName="menu__link--active" - {...childItemProps} - key={i} - /> - ) : ( + {items.map((childItemProps, i) => { + childItemProps.label = versions.find((version) => (childItemProps.label == version.version))?.isPrerelease ? `${childItemProps.label} (Beta)` : `${childItemProps.label}`; + return ( versionContext.updateVersion(e) + : onClick + } activeClassName="menu__link--active" {...childItemProps} key={i} /> ) - ))} + } + )} ); } +export default function DropdownNavbarItem({ mobile = false, ...props }) { + const Comp = mobile ? DropdownNavbarItemMobile : DropdownNavbarItemDesktop; -export default function DropdownNavbarItem({mobile = false, ...props}) { + // dbt Custom const versionContext = useContext(VersionContext) - const Comp = mobile ? 
DropdownNavbarItemMobile : DropdownNavbarItemDesktop; return ; } diff --git a/website/src/theme/NavbarItem/VersionsNavItem.js b/website/src/theme/NavbarItem/VersionsNavItem.js deleted file mode 100644 index d801c72d2c2..00000000000 --- a/website/src/theme/NavbarItem/VersionsNavItem.js +++ /dev/null @@ -1,84 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - import React from 'react'; - import clsx from 'clsx'; - import {getInfimaActiveClassName} from '@theme/NavbarItem/utils'; - - const dropdownLinkActiveClass = 'dropdown__link--active'; - export function NavLink({ - activeBasePath, - activeBaseRegex, - to, - href, - label, - activeClassName = '', - prependBaseUrlToHref, - ...props - }) { - - return ( - - { label } - - ); - } - - function DefaultNavbarItemDesktop({ - className, - isDropdownItem = false, - ...props - }) { - const element = ( - - ); - - if (isDropdownItem) { - return
        • {element}
        • ; - } - - return element; - } - - function DefaultNavbarItemMobile({ - className, - isDropdownItem: _isDropdownItem, - ...props - }) { - return ( -
        • - -
        • - ); - } - - function DefaultNavbarItem({ - mobile = false, - position: _position, - // Need to destructure position from props so that it doesn't get passed on. - ...props - }) { - const Comp = mobile ? DefaultNavbarItemMobile : DefaultNavbarItemDesktop; - return ( - - ); - } - - export default DefaultNavbarItem; diff --git a/website/src/theme/NavbarItem/index.js b/website/src/theme/NavbarItem/index.js deleted file mode 100644 index a45147298bc..00000000000 --- a/website/src/theme/NavbarItem/index.js +++ /dev/null @@ -1,53 +0,0 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ -import React from 'react'; -import DefaultNavbarItem from '@theme/NavbarItem/DefaultNavbarItem'; -import DropdownNavbarItem from '@theme/NavbarItem/DropdownNavbarItem'; -import LocaleDropdownNavbarItem from '@theme/NavbarItem/LocaleDropdownNavbarItem'; -import SearchNavbarItem from '@theme/NavbarItem/SearchNavbarItem'; -const NavbarItemComponents = { - default: () => DefaultNavbarItem, - localeDropdown: () => LocaleDropdownNavbarItem, - search: () => SearchNavbarItem, - dropdown: () => DropdownNavbarItem, - // Need to lazy load these items as we don't know for sure the docs plugin is - // loaded. 
See https://github.com/facebook/docusaurus/issues/3360 - - /* eslint-disable @typescript-eslint/no-var-requires, global-require */ - docsVersion: () => require('@theme/NavbarItem/DocsVersionNavbarItem').default, - docsVersionDropdown: () => - require('@theme/NavbarItem/DocsVersionDropdownNavbarItem').default, - doc: () => require('@theme/NavbarItem/DocNavbarItem').default, - docSidebar: () => require('@theme/NavbarItem/DocSidebarNavbarItem').default, - /* eslint-enable @typescript-eslint/no-var-requires, global-require */ -}; - -const getNavbarItemComponent = (type) => { - const navbarItemComponentFn = NavbarItemComponents[type]; - - if (!navbarItemComponentFn) { - throw new Error(`No NavbarItem component found for type "${type}".`); - } - - return navbarItemComponentFn(); -}; - -function getComponentType(type, isDropdown) { - // Backward compatibility: navbar item with no type set - // but containing dropdown items should use the type "dropdown" - if (!type || type === 'default') { - return isDropdown ? 'dropdown' : 'default'; - } - - return type; -} - -export default function NavbarItem({type, ...props}) { - const componentType = getComponentType(type, props.items !== undefined); - const NavbarItemComponent = getNavbarItemComponent(componentType); - return ; -} diff --git a/website/src/theme/TOC/index.js b/website/src/theme/TOC/index.js index e3e34f6f6b7..ec29a1be80f 100644 --- a/website/src/theme/TOC/index.js +++ b/website/src/theme/TOC/index.js @@ -1,29 +1,23 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. 
- */ import React from 'react'; import clsx from 'clsx'; import TOCItems from '@theme/TOCItems'; +import styles from './styles.module.css'; import EditThisPage from '@theme/EditThisPage'; -import styles from './styles.module.css'; // Using a custom className -// This prevents TOC highlighting to highlight TOCInline/TOCCollapsible by mistake - -//dbt Custom import CTA from '../../components/cta'; +// Using a custom className +// This prevents TOCInline/TOCCollapsible getting highlighted by mistake +/* dbt Customizations: + * Import EditThisPage and CTA components + * add featured_cta & editUrl props and elements +*/ const LINK_CLASS_NAME = 'table-of-contents__link toc-highlight'; const LINK_ACTIVE_CLASS_NAME = 'table-of-contents__link--active'; - -function TOC({className, featured_cta, editUrl, ...props}) { - +export default function TOC({className, featured_cta, editUrl, ...props}) { return (
          @@ -34,10 +28,8 @@ function TOC({className, featured_cta, editUrl, ...props}) {
          )} {featured_cta && ( - - )} + + )}
          ); } - -export default TOC; diff --git a/website/src/theme/TOC/styles.module.css b/website/src/theme/TOC/styles.module.css index 4b6d2bcc657..4b5d9f462e0 100644 --- a/website/src/theme/TOC/styles.module.css +++ b/website/src/theme/TOC/styles.module.css @@ -1,10 +1,3 @@ -/** - * Copyright (c) Facebook, Inc. and its affiliates. - * - * This source code is licensed under the MIT license found in the - * LICENSE file in the root directory of this source tree. - */ - .tableOfContents { max-height: calc(100vh - (var(--ifm-navbar-height) + 2rem)); overflow-y: auto; diff --git a/website/src/utils/category-version-check.js b/website/src/utils/category-version-check.js new file mode 100644 index 00000000000..bf75853cd8a --- /dev/null +++ b/website/src/utils/category-version-check.js @@ -0,0 +1,38 @@ +export default function categoryVersionCheck(version, versionedCategories, category) { + let categoryAvailableObj = { + categoryAvailable: true + } + + if (!category) + return categoryAvailableObj + + const itemFound = versionedCategories.find(vcategory => vcategory.category === category) + + if (itemFound) { + + const { firstVersion, lastVersion } = itemFound + const currentVersionVal = parseFloat(version) + const firstVersionVal = parseFloat(firstVersion) || 0 + + categoryAvailableObj.firstAvailableVersion = firstVersion + + // Determine if category within version range + if (lastVersion) { + const lastVersionVal = parseFloat(lastVersion) + // If lastVersion set for category, + // check if current version is higher than lastVersion + // or if current version is less than firstVersion + // If true, remove category in sidebar + if (currentVersionVal > lastVersionVal || currentVersionVal < firstVersionVal) { + categoryAvailableObj.categoryAvailable = false + } + } else if (firstVersionVal > currentVersionVal) { + // If firstVersion is greater than currentVersion + // remove category from sidebar + categoryAvailableObj.categoryAvailable = false + } + } + + return 
categoryAvailableObj + } + \ No newline at end of file diff --git a/website/src/utils/get-all-posts.js b/website/src/utils/get-all-posts.js index ed2cb251c3c..4431b39d214 100644 --- a/website/src/utils/get-all-posts.js +++ b/website/src/utils/get-all-posts.js @@ -6,7 +6,7 @@ export default function getAllPosts() { */ return ((ctx) => { const blogpostNames = ctx.keys(); - return blogpostNames.reduce((blogposts, blogpostName, i) => { + return blogpostNames.reduce((blogposts, blogpostName) => { const module = ctx(blogpostName); const { image } = module.frontMatter const { date, formattedDate, title, permalink, authors, description, tags } = module.metadata; diff --git a/website/src/utils/get-icon-type.js b/website/src/utils/get-icon-type.js new file mode 100644 index 00000000000..9a2b7194adf --- /dev/null +++ b/website/src/utils/get-icon-type.js @@ -0,0 +1,24 @@ +// Util function to check which icon to render +import React from "react"; +import { useColorMode } from "@docusaurus/theme-common"; + +export default function getIconType(icon, ...styles) { + const { colorMode } = useColorMode(); + const combinedStyles = styles.join(" "); + + if (icon.startsWith("fa-")) { + return ; + } else { + return ( + + ); + } +} diff --git a/website/src/utils/get-spotlight-member.js b/website/src/utils/get-spotlight-member.js new file mode 100644 index 00000000000..1839e2c82a8 --- /dev/null +++ b/website/src/utils/get-spotlight-member.js @@ -0,0 +1,73 @@ +import spotlightData from './../../.docusaurus/docusaurus-build-spotlight-index-page-plugin/default/spotlight-page-data.json' + +/* + * Returns the spotlight member for the + * 'Community spotlight' section on the homepage. + * This tries to: + * 1. Find a spotlight member by the id entered in docusaurus.config.js. + * 2. If not found, find the latest spotlight member by dateCreated property. + * 3. Otherwise, this returns null, and the `featuredResource` + * will show in this section instead. 
+*/ +export const getSpotlightMember = (spotlightMember) => { + if(!spotlightData) return null + + // Get latest spotlight member + const latestMember = getLatestMember() + + // If spotlightMember not passed in from docusaurus.config.js + // Try to find latest spotlight member by date + if(!spotlightMember) { + return latestMember || null + } else { + // If spotlight member data found from ID, return this member + // Otherwise, return latestMember if available, or return null + const memberFound = findMemberById(spotlightMember) + return memberFound + ? memberFound + : latestMember + ? latestMember : null + } +} + +/* + * Gets latest community spotlight member + * This sorts all members by the `dateCreated` field +*/ +function getLatestMember() { + // Sort members by dateCreated + let sortedMembers = spotlightData.sort((a, b) => { + return new Date(b?.data?.dateCreated) - new Date(a?.data?.dateCreated) + }) + + if(!sortedMembers || sortedMembers?.length <= 0) return null + + // Return latest community spotlight member + const latestMember = sortedMembers[0]?.data + return setAdditionalProperties(latestMember) +} + +/* + * Checks if member available which matches `communitySpotlightMember` + * field within docusaurus.config.js +*/ +function findMemberById(spotlightMember) { + // Find member by ID + const thisMember = spotlightData?.find(member => member?.data?.id === spotlightMember)?.data + + if(!thisMember) return null + + // Return member found by ID + return setAdditionalProperties(thisMember) +} + +/* + * Adds sectionTitle & link properties +*/ +function setAdditionalProperties(member) { + if(!member) return null + const thisMember = member + thisMember.sectionTitle = 'Community spotlight' + thisMember.link = `/community/spotlight/${member.id}` + return thisMember +} diff --git a/website/src/utils/use-hash-link.js b/website/src/utils/use-hash-link.js new file mode 100644 index 00000000000..d771471412a --- /dev/null +++ b/website/src/utils/use-hash-link.js @@ 
-0,0 +1,23 @@ +export default function useHashLink() { + if (window.location.hash) { + const hashLink = document.getElementById( + window.location.hash.replace("#", "") + ); + if (hashLink) { + // Make sure the nav is hidden if loaded from an anchor link + // Prevents the nav from covering the hash link content + const navbar = document.getElementsByClassName("navbar")[0]; + navbar.classList.add("navbarHiddenOnLoad"); + + let scrollPos = 0; + window.addEventListener('scroll', function(){ + if ((document.body.getBoundingClientRect()).top > scrollPos) { + navbar.classList.remove("navbarHiddenOnLoad"); + } + scrollPos = (document.body.getBoundingClientRect()).top; + }); + + hashLink.scrollIntoView(); + } + } +} diff --git a/website/static/assets/beta-tc.pdf b/website/static/assets/beta-tc.pdf new file mode 100644 index 00000000000..f285cf95a55 Binary files /dev/null and b/website/static/assets/beta-tc.pdf differ diff --git a/website/static/css/featherlight-styles.css b/website/static/css/featherlight-styles.css new file mode 100644 index 00000000000..33760b1b011 --- /dev/null +++ b/website/static/css/featherlight-styles.css @@ -0,0 +1,120 @@ +/** + * Featherlight - ultra slim jQuery lightbox + * Version 1.7.14 - http://noelboss.github.io/featherlight/ + * + * Copyright 2019, Noël Raoul Bossart (http://www.noelboss.com) + * MIT Licensed. 
+ **/ +html.with-featherlight { + overflow: hidden +} + +.featherlight { + display: none; + position: fixed; + top: 0; + right: 0; + bottom: 0; + left: 0; + z-index: 2147483647; + text-align: center; + white-space: nowrap; + cursor: pointer; + background: #333; + background: rgba(0, 0, 0, 0) +} + +.featherlight:last-of-type { + background: rgba(0, 0, 0, .8) +} + +.featherlight:before { + content: ''; + display: inline-block; + height: 100%; + vertical-align: middle +} + +.featherlight .featherlight-content { + position: relative; + text-align: left; + vertical-align: middle; + display: inline-block; + overflow: auto; + padding: 25px 25px 0; + border-bottom: 25px solid transparent; + margin-left: 5%; + margin-right: 5%; + max-height: 95%; + background: #fff; + cursor: auto; + white-space: normal +} + +.featherlight .featherlight-inner { + display: block +} + +.featherlight link.featherlight-inner, +.featherlight script.featherlight-inner, +.featherlight style.featherlight-inner { + display: none +} + +.featherlight .featherlight-close-icon { + position: absolute; + z-index: 9999; + top: 0; + right: 0; + line-height: 25px; + width: 25px; + cursor: pointer; + text-align: center; + font-family: Arial, sans-serif; + background: #fff; + background: rgba(255, 255, 255, .3); + color: #000; + border: 0; + padding: 0 +} + +.featherlight .featherlight-close-icon::-moz-focus-inner { + border: 0; + padding: 0 +} + +.featherlight .featherlight-image { + width: 100% +} + +.featherlight-iframe .featherlight-content { + border-bottom: 0; + padding: 0; + -webkit-overflow-scrolling: touch +} + +.featherlight iframe { + border: 0 +} + +.featherlight * { + -webkit-box-sizing: border-box; + -moz-box-sizing: border-box; + box-sizing: border-box +} + +@media only screen and (max-width:1024px) { + .featherlight .featherlight-content { + margin-left: 0; + margin-right: 0; + max-height: 98%; + padding: 10px 10px 0; + border-bottom: 10px solid transparent + } +} + +@media print { + 
html.with-featherlight>*>:not(.featherlight) { + display: none + } +} diff --git a/website/static/feeds/.gitkeep b/website/static/feeds/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/website/static/fonts/burlingame/dbt_cloud-04112018/demo-async.htm b/website/static/fonts/burlingame/dbt_cloud-04112018/demo-async.htm index 4c4d07b1096..03203216493 100755 --- a/website/static/fonts/burlingame/dbt_cloud-04112018/demo-async.htm +++ b/website/static/fonts/burlingame/dbt_cloud-04112018/demo-async.htm @@ -109,15 +109,15 @@ src:url("Fonts/ce94fd84-1e91-423c-9076-29caef2afc09.eot?#iefix") format("eot"),url("Fonts/dc5bb64c-62db-456c-b10e-b7b75e038082.woff2") format("woff2"),url("Fonts/5638d3db-0cad-4c92-8deb-ffadeb71c3b7.woff") format("woff"),url("Fonts/2e9aefc3-4ae8-4aca-a2a8-9d2a467cf09f.ttf") format("truetype"); } - @@ -128,24 +128,24 @@

          Fonts.com Web fonts


          The fonts contained in this kit are:

          -
          Burlingame® W01 Thin
          -
          Burlingame® W01 Thin Italic
          -
          Burlingame® W01 Light
          -
          Burlingame® W01 Light Italic
          -
          Burlingame® W01 Regular
          -
          Burlingame® W01 Italic
          -
          Burlingame® W01 Medium
          -
          Burlingame® W01 Medium Italic
          -
          Burlingame® W01 Semi Bold
          -
          Burlingame® W01 Semi Bold Italic
          -
          Burlingame® W01 Bold
          -
          Burlingame® W01 Bold Italic
          -
          Burlingame® W01 Extra Bold
          -
          Burlingame® W01 Extra Bold Italic
          -
          Burlingame® W01 Black
          -
          Burlingame® W01 Black Italic
          -
          Burlingame® W01 Extra Black
          -
          Burlingame® W01 Extra Black Italic
          +
          Burlingame® W01 Thin
          +
          Burlingame® W01 Thin Italic
          +
          Burlingame® W01 Light
          +
          Burlingame® W01 Light Italic
          +
          Burlingame® W01 Regular
          +
          Burlingame® W01 Italic
          +
          Burlingame® W01 Medium
          +
          Burlingame® W01 Medium Italic
          +
          Burlingame® W01 Semi Bold
          +
          Burlingame® W01 Semi Bold Italic
          +
          Burlingame® W01 Bold
          +
          Burlingame® W01 Bold Italic
          +
          Burlingame® W01 Extra Bold
          +
          Burlingame® W01 Extra Bold Italic
          +
          Burlingame® W01 Black
          +
          Burlingame® W01 Black Italic
          +
          Burlingame® W01 Extra Black
          +
          Burlingame® W01 Extra Black Italic

          @@ -154,7 +154,7 @@

          Fonts.com Web fonts

          CSS Implementation Guidelines

          You and any third party web font hosting service are responsible for ensuring that the font software in the self-hosting kit, in its original format, can only be used on the Web Sites for which the self-hosting kit was downloaded and cannot be used or referenced by any other web site. This includes, but is not limited to installing adequate technical protection measures that restrict the use and/or access to the font software, for instance by utilizing JavaScript or access control mechanism for cross-origin resource sharing and protecting against use on web sites other than the Web Sites for which the self-hosting kit was downloaded by restricting domain access only to such Web Sites. You must also retain the pageview tracking code on any Web Site that you self-host. In the event this Agreement terminates for any reason, the font software included with the self-hosting kit must be deleted from the server and all copies must be destroyed or returned to Monotype Imaging.

          -

          View "WEB FONT SOFTWARE" LICENSE AGREEMENT

          +

          View "WEB FONT SOFTWARE" LICENSE AGREEMENT

          Asynchronous Implementation (Requires JavaScript)

          Font file names have been obfuscated to protect the font software. You can identify font format based on file ending:

            @@ -259,15 +259,15 @@

            Asynchronous Implementation (Requires JavaScript)

            src:url("Fonts/ce94fd84-1e91-423c-9076-29caef2afc09.eot?#iefix") format("eot"),url("Fonts/dc5bb64c-62db-456c-b10e-b7b75e038082.woff2") format("woff2"),url("Fonts/5638d3db-0cad-4c92-8deb-ffadeb71c3b7.woff") format("woff"),url("Fonts/2e9aefc3-4ae8-4aca-a2a8-9d2a467cf09f.ttf") format("truetype"); } -

            Licensing information should be included within the CSS

            /*
            diff --git a/website/static/fonts/burlingame/dbt_cloud-04112018/demo.htm b/website/static/fonts/burlingame/dbt_cloud-04112018/demo.htm
            index f2b4a4d7068..875a3a3d814 100755
            --- a/website/static/fonts/burlingame/dbt_cloud-04112018/demo.htm
            +++ b/website/static/fonts/burlingame/dbt_cloud-04112018/demo.htm
            @@ -1,25 +1,25 @@
            -
            -
            -
            -    Webfonts Demo
            -    
            -    
            -    
            +    
            -
            -
            -    
            -

            Fonts.com Web fonts

            -

            @font-face implementation instructions

            -
            -

            The fonts contained in this kit are:

            -
            -
            Burlingame® W01 Thin
            -
            Burlingame® W01 Thin Italic
            -
            Burlingame® W01 Light
            -
            Burlingame® W01 Light Italic
            -
            Burlingame® W01 Regular
            -
            Burlingame® W01 Italic
            -
            Burlingame® W01 Medium
            -
            Burlingame® W01 Medium Italic
            -
            Burlingame® W01 Semi Bold
            -
            Burlingame® W01 Semi Bold Italic
            -
            Burlingame® W01 Bold
            -
            Burlingame® W01 Bold Italic
            -
            Burlingame® W01 Extra Bold
            -
            Burlingame® W01 Extra Bold Italic
            -
            Burlingame® W01 Black
            -
            Burlingame® W01 Black Italic
            -
            Burlingame® W01 Extra Black
            -
            Burlingame® W01 Extra Black Italic
            - -
            -
            - Click here for asynchronous web fonts implementation -
            -
            -

            CSS Implementation Guidelines

            -

            You and any third party web font hosting service are responsible for ensuring that the font software in the self-hosting kit, in its original format, can only be used on the Web Sites for which the self-hosting kit was downloaded and cannot be used or referenced by any other web site. This includes, but is not limited to installing adequate technical protection measures that restrict the use and/or access to the font software, for instance by utilizing JavaScript or access control mechanism for cross-origin resource sharing and protecting against use on web sites other than the Web Sites for which the self-hosting kit was downloaded by restricting domain access only to such Web Sites. You must also retain the pageview tracking code on any Web Site that you self-host. In the event this Agreement terminates for any reason, the font software included with the self-hosting kit must be deleted from the server and all copies must be destroyed or returned to Monotype Imaging.

            -

            View "WEB FONT SOFTWARE" LICENSE AGREEMENT

            -

            Font file names have been obfuscated to protect the font software. You can identify font format based on file ending:

            -
              -
            • 1 - TrueType (ttf)
            • -
            • 2 - Embedded OpenType (eot)
            • -
            • 3 - Web Open Font Format (woff)
            • -
            • 4 - Web Open Font Format version 2 (woff2)
            • -
            • 5 - Scalable Vector Graphics (svg)
            • -
            -

            Copy and paste following code in head section of the page

            -
            - -

            Licensing information should be included within the CSS

            -
            /*
            -This CSS resource incorporates links to font software which is the valuable copyrighted
            -property of Monotype Imaging and/or its suppliers. You may not attempt to copy, install,
            -redistribute, convert, modify or reverse engineer this font software. Please contact Monotype
            -Imaging with any questions regarding Web Fonts:  http://www.fonts.com
            -*/
            -
            -
            - - + + + + +

            Licensing information should be included within the CSS

            +
            /*
            +This CSS resource incorporates links to font software which is the valuable copyrighted
            +property of Monotype Imaging and/or its suppliers. You may not attempt to copy, install,
            +redistribute, convert, modify or reverse engineer this font software. Please contact Monotype
            +Imaging with any questions regarding Web Fonts:  http://www.fonts.com
            +*/
            +
          +
          + + diff --git a/website/static/img/Filtering.png b/website/static/img/Filtering.png new file mode 100644 index 00000000000..b05394bd459 Binary files /dev/null and b/website/static/img/Filtering.png differ diff --git a/website/static/img/Paginate.png b/website/static/img/Paginate.png new file mode 100644 index 00000000000..21e2fd138b8 Binary files /dev/null and b/website/static/img/Paginate.png differ diff --git a/website/static/img/api-access-profile.jpg b/website/static/img/api-access-profile.jpg new file mode 100644 index 00000000000..36ffd4beda8 Binary files /dev/null and b/website/static/img/api-access-profile.jpg differ diff --git a/website/static/img/api-access-profile.png b/website/static/img/api-access-profile.png deleted file mode 100644 index deade9f2135..00000000000 Binary files a/website/static/img/api-access-profile.png and /dev/null differ diff --git a/website/static/img/blog/2021-11-22-sql-surrogate-keys/surrogate_key_comparison.png b/website/static/img/blog/2021-11-22-sql-surrogate-keys/surrogate_key_comparison.png new file mode 100644 index 00000000000..6666a6e3a3c Binary files /dev/null and b/website/static/img/blog/2021-11-22-sql-surrogate-keys/surrogate_key_comparison.png differ diff --git a/website/static/img/blog/2022-07-26-pre-commit-dbt/pre-commit-run-all-files.gif b/website/static/img/blog/2022-07-26-pre-commit-dbt/pre-commit-run-all-files.gif new file mode 100644 index 00000000000..a6cc2b99f58 Binary files /dev/null and b/website/static/img/blog/2022-07-26-pre-commit-dbt/pre-commit-run-all-files.gif differ diff --git a/website/static/img/blog/2023-01-17-grouping-data-tests/1-monotonicity.png b/website/static/img/blog/2023-01-17-grouping-data-tests/1-monotonicity.png new file mode 100644 index 00000000000..5dc9136fe73 Binary files /dev/null and b/website/static/img/blog/2023-01-17-grouping-data-tests/1-monotonicity.png differ diff --git a/website/static/img/blog/2023-01-17-grouping-data-tests/2-missing.png 
b/website/static/img/blog/2023-01-17-grouping-data-tests/2-missing.png new file mode 100644 index 00000000000..0016667c079 Binary files /dev/null and b/website/static/img/blog/2023-01-17-grouping-data-tests/2-missing.png differ diff --git a/website/static/img/blog/2023-01-27-autoscaling-ci/01-yolo-prod.png b/website/static/img/blog/2023-01-27-autoscaling-ci/01-yolo-prod.png new file mode 100644 index 00000000000..39b03ab4e26 Binary files /dev/null and b/website/static/img/blog/2023-01-27-autoscaling-ci/01-yolo-prod.png differ diff --git a/website/static/img/blog/2023-02-01-ingestion-time-partitioning-bigquery/merge-vs-select.png b/website/static/img/blog/2023-02-01-ingestion-time-partitioning-bigquery/merge-vs-select.png new file mode 100644 index 00000000000..3935f94f7c9 Binary files /dev/null and b/website/static/img/blog/2023-02-01-ingestion-time-partitioning-bigquery/merge-vs-select.png differ diff --git a/website/static/img/blog/2023-03-23-audit-helper/image1.png b/website/static/img/blog/2023-03-23-audit-helper/image1.png new file mode 100644 index 00000000000..aa81eca82e3 Binary files /dev/null and b/website/static/img/blog/2023-03-23-audit-helper/image1.png differ diff --git a/website/static/img/blog/2023-03-23-audit-helper/image2.png b/website/static/img/blog/2023-03-23-audit-helper/image2.png new file mode 100644 index 00000000000..58cf5c1b917 Binary files /dev/null and b/website/static/img/blog/2023-03-23-audit-helper/image2.png differ diff --git a/website/static/img/blog/2023-03-23-audit-helper/image3.png b/website/static/img/blog/2023-03-23-audit-helper/image3.png new file mode 100644 index 00000000000..cb8744879a0 Binary files /dev/null and b/website/static/img/blog/2023-03-23-audit-helper/image3.png differ diff --git a/website/static/img/blog/2023-03-23-audit-helper/image4.png b/website/static/img/blog/2023-03-23-audit-helper/image4.png new file mode 100644 index 00000000000..b994960199f Binary files /dev/null and 
b/website/static/img/blog/2023-03-23-audit-helper/image4.png differ diff --git a/website/static/img/blog/2023-03-23-audit-helper/image5.png b/website/static/img/blog/2023-03-23-audit-helper/image5.png new file mode 100644 index 00000000000..c322387604b Binary files /dev/null and b/website/static/img/blog/2023-03-23-audit-helper/image5.png differ diff --git a/website/static/img/blog/2023-03-23-audit-helper/image6.png b/website/static/img/blog/2023-03-23-audit-helper/image6.png new file mode 100644 index 00000000000..1a8937d07ff Binary files /dev/null and b/website/static/img/blog/2023-03-23-audit-helper/image6.png differ diff --git a/website/static/img/blog/2023-03-23-audit-helper/image7.png b/website/static/img/blog/2023-03-23-audit-helper/image7.png new file mode 100644 index 00000000000..03773d582f2 Binary files /dev/null and b/website/static/img/blog/2023-03-23-audit-helper/image7.png differ diff --git a/website/static/img/blog/2023-03-23-audit-helper/image8.png b/website/static/img/blog/2023-03-23-audit-helper/image8.png new file mode 100644 index 00000000000..ea5e2454812 Binary files /dev/null and b/website/static/img/blog/2023-03-23-audit-helper/image8.png differ diff --git a/website/static/img/blog/2023-04-17-dbt-squared/roche-db-diagram.png b/website/static/img/blog/2023-04-17-dbt-squared/roche-db-diagram.png new file mode 100644 index 00000000000..9bd9d4766a6 Binary files /dev/null and b/website/static/img/blog/2023-04-17-dbt-squared/roche-db-diagram.png differ diff --git a/website/static/img/blog/2023-04-17-dbt-squared/roche-meme.png b/website/static/img/blog/2023-04-17-dbt-squared/roche-meme.png new file mode 100644 index 00000000000..48b189caee4 Binary files /dev/null and b/website/static/img/blog/2023-04-17-dbt-squared/roche-meme.png differ diff --git a/website/static/img/blog/2023-04-17-dbt-squared/roche-project-tree.png b/website/static/img/blog/2023-04-17-dbt-squared/roche-project-tree.png new file mode 100644 index 00000000000..2cfb608e556 Binary 
files /dev/null and b/website/static/img/blog/2023-04-17-dbt-squared/roche-project-tree.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/3nf-to-dimensional-model.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/3nf-to-dimensional-model.png new file mode 100644 index 00000000000..541320547a4 Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/3nf-to-dimensional-model.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/conversation.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/conversation.png new file mode 100644 index 00000000000..06a9505b185 Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/conversation.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/data-modelling.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/data-modelling.png new file mode 100644 index 00000000000..61641ddc484 Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/data-modelling.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/dbt-dag.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/dbt-dag.png new file mode 100644 index 00000000000..35977760327 Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/dbt-dag.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/dimension-tables.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/dimension-tables.png new file mode 100644 index 00000000000..575324842f7 Binary files 
/dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/dimension-tables.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/fct_sales.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/fct_sales.png new file mode 100644 index 00000000000..d0a4443f956 Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/fct_sales.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/sales-order-header-detail.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/sales-order-header-detail.png new file mode 100644 index 00000000000..49a5f61eaea Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/sales-order-header-detail.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/snowflake-schema.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/snowflake-schema.png new file mode 100644 index 00000000000..dfa420bcc0a Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/snowflake-schema.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/source-schema.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/source-schema.png new file mode 100644 index 00000000000..b86831a00a4 Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/source-schema.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/star-schema.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/star-schema.png new file mode 100644 index 
00000000000..a23ae73507a Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/star-schema.png differ diff --git a/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/target-schema.png b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/target-schema.png new file mode 100644 index 00000000000..61e0bfc535e Binary files /dev/null and b/website/static/img/blog/2023-04-18-building-a-kimball-dimensional-model-with-dbt/target-schema.png differ diff --git a/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure1.png b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure1.png new file mode 100644 index 00000000000..400b78e0f08 Binary files /dev/null and b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure1.png differ diff --git a/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure3.png b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure3.png new file mode 100644 index 00000000000..bee393a3e2b Binary files /dev/null and b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure3.png differ diff --git a/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure4.png b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure4.png new file mode 100644 index 00000000000..035ee81f760 Binary files /dev/null and b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure4.png differ diff --git a/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure5.png b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure5.png new file mode 100644 index 00000000000..7c2c8439c22 Binary files /dev/null and b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure5.png differ diff --git a/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure6.png 
b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure6.png new file mode 100644 index 00000000000..d6b9c33f231 Binary files /dev/null and b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure6.png differ diff --git a/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure7.png b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure7.png new file mode 100644 index 00000000000..937749ac149 Binary files /dev/null and b/website/static/img/blog/2023-04-24-framework-refactor-alteryx-dbt/Figure7.png differ diff --git a/website/static/img/blog/2023-05-02-modeling-ragged-time-varying-hierarchies/hierarchy.png b/website/static/img/blog/2023-05-02-modeling-ragged-time-varying-hierarchies/hierarchy.png new file mode 100644 index 00000000000..88316f363f2 Binary files /dev/null and b/website/static/img/blog/2023-05-02-modeling-ragged-time-varying-hierarchies/hierarchy.png differ diff --git a/website/static/img/blog/2023-05-04-generating-dynamic-docs/1.png b/website/static/img/blog/2023-05-04-generating-dynamic-docs/1.png new file mode 100644 index 00000000000..1274ec54d6f Binary files /dev/null and b/website/static/img/blog/2023-05-04-generating-dynamic-docs/1.png differ diff --git a/website/static/img/blog/2023-05-04-generating-dynamic-docs/2.jpg b/website/static/img/blog/2023-05-04-generating-dynamic-docs/2.jpg new file mode 100644 index 00000000000..d0edf990bf7 Binary files /dev/null and b/website/static/img/blog/2023-05-04-generating-dynamic-docs/2.jpg differ diff --git a/website/static/img/blog/2023-05-04-generating-dynamic-docs/3.png b/website/static/img/blog/2023-05-04-generating-dynamic-docs/3.png new file mode 100644 index 00000000000..46fda3f2672 Binary files /dev/null and b/website/static/img/blog/2023-05-04-generating-dynamic-docs/3.png differ diff --git a/website/static/img/blog/2023-05-04-generating-dynamic-docs/4.png b/website/static/img/blog/2023-05-04-generating-dynamic-docs/4.png new 
file mode 100644 index 00000000000..f412e6a20b5 Binary files /dev/null and b/website/static/img/blog/2023-05-04-generating-dynamic-docs/4.png differ diff --git a/website/static/img/blog/2023-05-04-generating-dynamic-docs/5.jpg b/website/static/img/blog/2023-05-04-generating-dynamic-docs/5.jpg new file mode 100644 index 00000000000..6d414a555a4 Binary files /dev/null and b/website/static/img/blog/2023-05-04-generating-dynamic-docs/5.jpg differ diff --git a/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-models-dependency-graph.png b/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-models-dependency-graph.png new file mode 100644 index 00000000000..991cc89564d Binary files /dev/null and b/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-models-dependency-graph.png differ diff --git a/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-segmentation-matrix.png b/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-segmentation-matrix.png new file mode 100644 index 00000000000..4ff7462b8e4 Binary files /dev/null and b/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-segmentation-matrix.png differ diff --git a/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-segments-example.png b/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-segments-example.png new file mode 100644 index 00000000000..7a63668cf0d Binary files /dev/null and b/website/static/img/blog/2023-05-08-building-a-historical-user-segmentation-model-with-dbt/rfm-segments-example.png differ diff --git a/website/static/img/blog/2023-07-03-data-vault-2-0-with-dbt-cloud/data-dungeon-meme.jpeg 
b/website/static/img/blog/2023-07-03-data-vault-2-0-with-dbt-cloud/data-dungeon-meme.jpeg new file mode 100644 index 00000000000..cbedf5014d5 Binary files /dev/null and b/website/static/img/blog/2023-07-03-data-vault-2-0-with-dbt-cloud/data-dungeon-meme.jpeg differ diff --git a/website/static/img/blog/2023-07-03-data-vault-2-0-with-dbt-cloud/reservoir-dam-hallucination.png b/website/static/img/blog/2023-07-03-data-vault-2-0-with-dbt-cloud/reservoir-dam-hallucination.png new file mode 100644 index 00000000000..2a37cc567b2 Binary files /dev/null and b/website/static/img/blog/2023-07-03-data-vault-2-0-with-dbt-cloud/reservoir-dam-hallucination.png differ diff --git a/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image1.png b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image1.png new file mode 100644 index 00000000000..687bdef7568 Binary files /dev/null and b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image1.png differ diff --git a/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image2.png b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image2.png new file mode 100644 index 00000000000..658e4c0cfb5 Binary files /dev/null and b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image2.png differ diff --git a/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image3.png b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image3.png new file mode 100644 index 00000000000..fa4b837a82f Binary files /dev/null and b/website/static/img/blog/2023-07-17-GPT-and-dbt-test/image3.png differ diff --git a/website/static/img/blog/2023-08-01-announcing-materialized-views/materialized-incremental-twins.jpg b/website/static/img/blog/2023-08-01-announcing-materialized-views/materialized-incremental-twins.jpg new file mode 100644 index 00000000000..bd72dba3663 Binary files /dev/null and b/website/static/img/blog/2023-08-01-announcing-materialized-views/materialized-incremental-twins.jpg differ diff --git 
a/website/static/img/blog/2023-08-01-announcing-materialized-views/streaming-pipeline.png b/website/static/img/blog/2023-08-01-announcing-materialized-views/streaming-pipeline.png new file mode 100644 index 00000000000..70baf10473e Binary files /dev/null and b/website/static/img/blog/2023-08-01-announcing-materialized-views/streaming-pipeline.png differ diff --git a/website/static/img/blog/authors/arthur-marcon.png b/website/static/img/blog/authors/arthur-marcon.png new file mode 100644 index 00000000000..fd0378475b1 Binary files /dev/null and b/website/static/img/blog/authors/arthur-marcon.png differ diff --git a/website/static/img/blog/authors/callie-white.jpg b/website/static/img/blog/authors/callie-white.jpg new file mode 100644 index 00000000000..e235016c926 Binary files /dev/null and b/website/static/img/blog/authors/callie-white.jpg differ diff --git a/website/static/img/blog/authors/christian-van-bellen.JPG b/website/static/img/blog/authors/christian-van-bellen.JPG new file mode 100644 index 00000000000..c5d598f2945 Binary files /dev/null and b/website/static/img/blog/authors/christian-van-bellen.JPG differ diff --git a/website/static/img/blog/authors/christophe-oudar.png b/website/static/img/blog/authors/christophe-oudar.png new file mode 100644 index 00000000000..cdec9b0537c Binary files /dev/null and b/website/static/img/blog/authors/christophe-oudar.png differ diff --git a/website/static/img/blog/authors/dguthrie.jpeg b/website/static/img/blog/authors/dguthrie.jpeg new file mode 100644 index 00000000000..3e6c107cc41 Binary files /dev/null and b/website/static/img/blog/authors/dguthrie.jpeg differ diff --git a/website/static/img/blog/authors/emily-riederer.jpeg b/website/static/img/blog/authors/emily-riederer.jpeg new file mode 100644 index 00000000000..49d2e15946a Binary files /dev/null and b/website/static/img/blog/authors/emily-riederer.jpeg differ diff --git a/website/static/img/blog/authors/jade-milaney.jpg 
b/website/static/img/blog/authors/jade-milaney.jpg new file mode 100644 index 00000000000..083dfd3f9a0 Binary files /dev/null and b/website/static/img/blog/authors/jade-milaney.jpg differ diff --git a/website/static/img/blog/authors/joao_antunes.jpg b/website/static/img/blog/authors/joao_antunes.jpg new file mode 100644 index 00000000000..a1dc5aa3cc1 Binary files /dev/null and b/website/static/img/blog/authors/joao_antunes.jpg differ diff --git a/website/static/img/blog/authors/jonathan-neo.png b/website/static/img/blog/authors/jonathan-neo.png new file mode 100644 index 00000000000..ab6849c21cc Binary files /dev/null and b/website/static/img/blog/authors/jonathan-neo.png differ diff --git a/website/static/img/blog/authors/lucas-dias.jpg b/website/static/img/blog/authors/lucas-dias.jpg new file mode 100644 index 00000000000..359caf7d325 Binary files /dev/null and b/website/static/img/blog/authors/lucas-dias.jpg differ diff --git a/website/static/img/blog/authors/mikael-thorup.jpeg b/website/static/img/blog/authors/mikael-thorup.jpeg new file mode 100644 index 00000000000..fe62f039028 Binary files /dev/null and b/website/static/img/blog/authors/mikael-thorup.jpeg differ diff --git a/website/static/img/blog/authors/noah-kennedy.png b/website/static/img/blog/authors/noah-kennedy.png new file mode 100644 index 00000000000..e9090c0238d Binary files /dev/null and b/website/static/img/blog/authors/noah-kennedy.png differ diff --git a/website/static/img/blog/authors/pedro_brito.jpeg b/website/static/img/blog/authors/pedro_brito.jpeg new file mode 100644 index 00000000000..9f163a431f3 Binary files /dev/null and b/website/static/img/blog/authors/pedro_brito.jpeg differ diff --git a/website/static/img/blog/authors/rastislav-zdechovan.png b/website/static/img/blog/authors/rastislav-zdechovan.png new file mode 100644 index 00000000000..40f8151d620 Binary files /dev/null and b/website/static/img/blog/authors/rastislav-zdechovan.png differ diff --git 
a/website/static/img/blog/authors/sam-harting.png b/website/static/img/blog/authors/sam-harting.png new file mode 100644 index 00000000000..eeec9f3c9aa Binary files /dev/null and b/website/static/img/blog/authors/sam-harting.png differ diff --git a/website/static/img/blog/authors/santiago-jauregui.jpeg b/website/static/img/blog/authors/santiago-jauregui.jpeg new file mode 100644 index 00000000000..dcc5fbeb16b Binary files /dev/null and b/website/static/img/blog/authors/santiago-jauregui.jpeg differ diff --git a/website/static/img/blog/authors/sean_mcintyre.jpg b/website/static/img/blog/authors/sean_mcintyre.jpg new file mode 100644 index 00000000000..be5ff5bf2b0 Binary files /dev/null and b/website/static/img/blog/authors/sean_mcintyre.jpg differ diff --git a/website/static/img/blog/authors/sterling-paramore.png b/website/static/img/blog/authors/sterling-paramore.png new file mode 100644 index 00000000000..488bade8abd Binary files /dev/null and b/website/static/img/blog/authors/sterling-paramore.png differ diff --git a/website/static/img/blog/authors/yannick_misteli.jpg b/website/static/img/blog/authors/yannick_misteli.jpg new file mode 100644 index 00000000000..7b1c46fb33f Binary files /dev/null and b/website/static/img/blog/authors/yannick_misteli.jpg differ diff --git a/website/static/img/codespace-quickstart/postCreateCommand.png b/website/static/img/codespace-quickstart/postCreateCommand.png new file mode 100644 index 00000000000..87783c23590 Binary files /dev/null and b/website/static/img/codespace-quickstart/postCreateCommand.png differ diff --git a/website/static/img/command-line.png b/website/static/img/command-line.png new file mode 100644 index 00000000000..df0dd28e61d Binary files /dev/null and b/website/static/img/command-line.png differ diff --git a/website/static/img/community/spotlight/alan-cruickshank.jpg b/website/static/img/community/spotlight/alan-cruickshank.jpg new file mode 100644 index 00000000000..bafc53aeb9d Binary files /dev/null and 
b/website/static/img/community/spotlight/alan-cruickshank.jpg differ diff --git a/website/static/img/community/spotlight/anya-prosvetova.jpg b/website/static/img/community/spotlight/anya-prosvetova.jpg new file mode 100644 index 00000000000..d87edd8d63b Binary files /dev/null and b/website/static/img/community/spotlight/anya-prosvetova.jpg differ diff --git a/website/static/img/community/spotlight/bruno-de-lima.jpg b/website/static/img/community/spotlight/bruno-de-lima.jpg new file mode 100644 index 00000000000..3124329401c Binary files /dev/null and b/website/static/img/community/spotlight/bruno-de-lima.jpg differ diff --git a/website/static/img/community/spotlight/david-effiong.jpg b/website/static/img/community/spotlight/david-effiong.jpg new file mode 100644 index 00000000000..358a1994fda Binary files /dev/null and b/website/static/img/community/spotlight/david-effiong.jpg differ diff --git a/website/static/img/community/spotlight/emily-riederer.jpg b/website/static/img/community/spotlight/emily-riederer.jpg new file mode 100644 index 00000000000..8289bb01789 Binary files /dev/null and b/website/static/img/community/spotlight/emily-riederer.jpg differ diff --git a/website/static/img/community/spotlight/fabiyi-opeyemi.jpg b/website/static/img/community/spotlight/fabiyi-opeyemi.jpg new file mode 100644 index 00000000000..f1ac40dfa6d Binary files /dev/null and b/website/static/img/community/spotlight/fabiyi-opeyemi.jpg differ diff --git a/website/static/img/community/spotlight/faith-lierheimer.jpg b/website/static/img/community/spotlight/faith-lierheimer.jpg new file mode 100644 index 00000000000..5ec1dc39719 Binary files /dev/null and b/website/static/img/community/spotlight/faith-lierheimer.jpg differ diff --git a/website/static/img/community/spotlight/jing-lim.jpg b/website/static/img/community/spotlight/jing-lim.jpg new file mode 100644 index 00000000000..7f7964d3bc6 Binary files /dev/null and b/website/static/img/community/spotlight/jing-lim.jpg differ diff 
--git a/website/static/img/community/spotlight/josh-devlin.jpg b/website/static/img/community/spotlight/josh-devlin.jpg new file mode 100644 index 00000000000..58e2b4da854 Binary files /dev/null and b/website/static/img/community/spotlight/josh-devlin.jpg differ diff --git a/website/static/img/community/spotlight/karen-hsieh.jpg b/website/static/img/community/spotlight/karen-hsieh.jpg new file mode 100644 index 00000000000..2af8a1d00bb Binary files /dev/null and b/website/static/img/community/spotlight/karen-hsieh.jpg differ diff --git a/website/static/img/community/spotlight/owen-prough.jpg b/website/static/img/community/spotlight/owen-prough.jpg new file mode 100644 index 00000000000..83e8c82cef5 Binary files /dev/null and b/website/static/img/community/spotlight/owen-prough.jpg differ diff --git a/website/static/img/community/spotlight/placeholder-img-1.jpg b/website/static/img/community/spotlight/placeholder-img-1.jpg new file mode 100644 index 00000000000..4e3b8fd01a1 Binary files /dev/null and b/website/static/img/community/spotlight/placeholder-img-1.jpg differ diff --git a/website/static/img/community/spotlight/shinya-takimoto.jpg b/website/static/img/community/spotlight/shinya-takimoto.jpg new file mode 100644 index 00000000000..da9b6925279 Binary files /dev/null and b/website/static/img/community/spotlight/shinya-takimoto.jpg differ diff --git a/website/static/img/databricks_tutorial/images/create_table_using_databricks_SQL.png b/website/static/img/databricks_tutorial/images/create_table_using_databricks_SQL.png deleted file mode 100644 index c7811ba80c8..00000000000 Binary files a/website/static/img/databricks_tutorial/images/create_table_using_databricks_SQL.png and /dev/null differ diff --git a/website/static/img/databricks_tutorial/images/new_file_upload_using_databricks_SQL.png b/website/static/img/databricks_tutorial/images/new_file_upload_using_databricks_SQL.png new file mode 100644 index 00000000000..eda66f18f73 Binary files /dev/null and 
b/website/static/img/databricks_tutorial/images/new_file_upload_using_databricks_SQL.png differ diff --git a/website/static/img/dbt-cloud-project-setup-flow-next.png b/website/static/img/dbt-cloud-project-setup-flow-next.png index 660e8ae446a..92f46bccd0a 100644 Binary files a/website/static/img/dbt-cloud-project-setup-flow-next.png and b/website/static/img/dbt-cloud-project-setup-flow-next.png differ diff --git a/website/static/img/delete_projects_from_dbt_cloud_20221023.gif b/website/static/img/delete_projects_from_dbt_cloud_20221023.gif new file mode 100644 index 00000000000..b579556d457 Binary files /dev/null and b/website/static/img/delete_projects_from_dbt_cloud_20221023.gif differ diff --git a/website/static/img/docs/building-a-dbt-project/MetricFlow-SchemaExample.jpeg b/website/static/img/docs/building-a-dbt-project/MetricFlow-SchemaExample.jpeg new file mode 100644 index 00000000000..9b0f0181b76 Binary files /dev/null and b/website/static/img/docs/building-a-dbt-project/MetricFlow-SchemaExample.jpeg differ diff --git a/website/static/img/docs/building-a-dbt-project/dag-exposures.png b/website/static/img/docs/building-a-dbt-project/dag-exposures.png index d086b6bdfd1..d5649440c97 100644 Binary files a/website/static/img/docs/building-a-dbt-project/dag-exposures.png and b/website/static/img/docs/building-a-dbt-project/dag-exposures.png differ diff --git a/website/static/img/docs/building-a-dbt-project/dbt-docs-exposures.png b/website/static/img/docs/building-a-dbt-project/dbt-docs-exposures.png index 6efc1d39a22..25a00af109f 100644 Binary files a/website/static/img/docs/building-a-dbt-project/dbt-docs-exposures.png and b/website/static/img/docs/building-a-dbt-project/dbt-docs-exposures.png differ diff --git a/website/static/img/docs/building-a-dbt-project/multihop-diagram.png b/website/static/img/docs/building-a-dbt-project/multihop-diagram.png new file mode 100644 index 00000000000..b6df1c12c03 Binary files /dev/null and 
b/website/static/img/docs/building-a-dbt-project/multihop-diagram.png differ diff --git a/website/static/img/docs/building-a-dbt-project/project-subdirectory.jpg b/website/static/img/docs/building-a-dbt-project/project-subdirectory.jpg new file mode 100644 index 00000000000..d68e6f928fe Binary files /dev/null and b/website/static/img/docs/building-a-dbt-project/project-subdirectory.jpg differ diff --git a/website/static/img/docs/collaborate/bigquery-deploy-env-deploy-credentials.png b/website/static/img/docs/collaborate/bigquery-deploy-env-deploy-credentials.png new file mode 100644 index 00000000000..5d730ad908d Binary files /dev/null and b/website/static/img/docs/collaborate/bigquery-deploy-env-deploy-credentials.png differ diff --git a/website/static/img/docs/collaborate/databricks-deploy-env-deploy-connection.png b/website/static/img/docs/collaborate/databricks-deploy-env-deploy-connection.png new file mode 100644 index 00000000000..00ad5e52e9b Binary files /dev/null and b/website/static/img/docs/collaborate/databricks-deploy-env-deploy-connection.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/catalog-sidebar-v1.gif b/website/static/img/docs/collaborate/dbt-explorer/catalog-sidebar-v1.gif new file mode 100644 index 00000000000..458aa8e874d Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/catalog-sidebar-v1.gif differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/cross-project-lineage-child.png b/website/static/img/docs/collaborate/dbt-explorer/cross-project-lineage-child.png new file mode 100644 index 00000000000..666db3384fa Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/cross-project-lineage-child.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/cross-project-lineage-parent.png b/website/static/img/docs/collaborate/dbt-explorer/cross-project-lineage-parent.png new file mode 100644 index 00000000000..ee5d19de369 Binary files /dev/null 
and b/website/static/img/docs/collaborate/dbt-explorer/cross-project-lineage-parent.png differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/lineage-v1.gif b/website/static/img/docs/collaborate/dbt-explorer/lineage-v1.gif new file mode 100644 index 00000000000..2772eaa9619 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/lineage-v1.gif differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/model-resource-details-v1.gif b/website/static/img/docs/collaborate/dbt-explorer/model-resource-details-v1.gif new file mode 100644 index 00000000000..24c8312af11 Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/model-resource-details-v1.gif differ diff --git a/website/static/img/docs/collaborate/dbt-explorer/search-v1.gif b/website/static/img/docs/collaborate/dbt-explorer/search-v1.gif new file mode 100644 index 00000000000..1343f58171d Binary files /dev/null and b/website/static/img/docs/collaborate/dbt-explorer/search-v1.gif differ diff --git a/website/static/img/docs/collaborate/postgres-deploy-env-deploy-credentials.png b/website/static/img/docs/collaborate/postgres-deploy-env-deploy-credentials.png new file mode 100644 index 00000000000..48c87ca69b0 Binary files /dev/null and b/website/static/img/docs/collaborate/postgres-deploy-env-deploy-credentials.png differ diff --git a/website/static/img/docs/collaborate/snowflake-deploy-env-deploy-connection.png b/website/static/img/docs/collaborate/snowflake-deploy-env-deploy-connection.png new file mode 100644 index 00000000000..0009790aed6 Binary files /dev/null and b/website/static/img/docs/collaborate/snowflake-deploy-env-deploy-connection.png differ diff --git a/website/static/img/docs/collaborate/snowflake-deploy-env-deploy-credentials.png b/website/static/img/docs/collaborate/snowflake-deploy-env-deploy-credentials.png new file mode 100644 index 00000000000..587cd8de2c2 Binary files /dev/null and 
b/website/static/img/docs/collaborate/snowflake-deploy-env-deploy-credentials.png differ diff --git a/website/static/img/docs/collaborate/spark-deploy-env-deploy-credentials.png b/website/static/img/docs/collaborate/spark-deploy-env-deploy-credentials.png new file mode 100644 index 00000000000..851f851f7a4 Binary files /dev/null and b/website/static/img/docs/collaborate/spark-deploy-env-deploy-credentials.png differ diff --git a/website/static/img/docs/dbt-cloud/Confirm Delete.png b/website/static/img/docs/dbt-cloud/Confirm Delete.png new file mode 100644 index 00000000000..b4ff85422d4 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/Confirm Delete.png differ diff --git a/website/static/img/docs/dbt-cloud/Edit The Project.png b/website/static/img/docs/dbt-cloud/Edit The Project.png new file mode 100644 index 00000000000..5bb1eed2d07 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/Edit The Project.png differ diff --git a/website/static/img/docs/dbt-cloud/Navigate To Account Settings.png b/website/static/img/docs/dbt-cloud/Navigate To Account Settings.png new file mode 100644 index 00000000000..74cd23323e7 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/Navigate To Account Settings.png differ diff --git a/website/static/img/docs/dbt-cloud/Select A Project.png b/website/static/img/docs/dbt-cloud/Select A Project.png new file mode 100644 index 00000000000..54cc7942d6f Binary files /dev/null and b/website/static/img/docs/dbt-cloud/Select A Project.png differ diff --git a/website/static/img/docs/dbt-cloud/Select Delete A Project.png b/website/static/img/docs/dbt-cloud/Select Delete A Project.png new file mode 100644 index 00000000000..2ee8af68ec7 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/Select Delete A Project.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/aad-app-registration.png b/website/static/img/docs/dbt-cloud/access-control/aad-app-registration.png new file mode 100644 
index 00000000000..61ef8851f4e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/aad-app-registration.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/app-overview.png b/website/static/img/docs/dbt-cloud/access-control/app-overview.png new file mode 100644 index 00000000000..ee3624ba65c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/app-overview.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/azure-enable.png b/website/static/img/docs/dbt-cloud/access-control/azure-enable.png new file mode 100644 index 00000000000..c9e06a46b83 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/azure-enable.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/begin-migration.png b/website/static/img/docs/dbt-cloud/access-control/begin-migration.png new file mode 100644 index 00000000000..6b88006c523 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/begin-migration.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/cloud-sso-fields.png b/website/static/img/docs/dbt-cloud/access-control/cloud-sso-fields.png new file mode 100644 index 00000000000..cd448339bf1 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/cloud-sso-fields.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/enable-auth0.png b/website/static/img/docs/dbt-cloud/access-control/enable-auth0.png new file mode 100644 index 00000000000..a4850b48ac8 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/enable-auth0.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/google-cloud-sso.png b/website/static/img/docs/dbt-cloud/access-control/google-cloud-sso.png new file mode 100644 index 00000000000..21f59ed7184 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/google-cloud-sso.png differ diff --git 
a/website/static/img/docs/dbt-cloud/access-control/google-enable.png b/website/static/img/docs/dbt-cloud/access-control/google-enable.png new file mode 100644 index 00000000000..f87858ff751 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/google-enable.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/google-uri.png b/website/static/img/docs/dbt-cloud/access-control/google-uri.png new file mode 100644 index 00000000000..819836f0584 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/google-uri.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/new-okta-completed.png b/website/static/img/docs/dbt-cloud/access-control/new-okta-completed.png new file mode 100644 index 00000000000..021b60213e1 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/new-okta-completed.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/new-okta-config.png b/website/static/img/docs/dbt-cloud/access-control/new-okta-config.png new file mode 100644 index 00000000000..a256ef37a41 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/new-okta-config.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/redirect-URI.png b/website/static/img/docs/dbt-cloud/access-control/redirect-URI.png new file mode 100644 index 00000000000..158b62bdd32 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/redirect-URI.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/saml-enable.png b/website/static/img/docs/dbt-cloud/access-control/saml-enable.png new file mode 100644 index 00000000000..6a4d958a92e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/saml-enable.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/sso-migration-available.png b/website/static/img/docs/dbt-cloud/access-control/sso-migration-available.png new file mode 
100644 index 00000000000..1d6bb513118 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/sso-migration-available.png differ diff --git a/website/static/img/docs/dbt-cloud/access-control/sso-project.png b/website/static/img/docs/dbt-cloud/access-control/sso-project.png new file mode 100644 index 00000000000..6769c2ada1c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/access-control/sso-project.png differ diff --git a/website/static/img/docs/dbt-cloud/change_user_to_read_only_20221023.gif b/website/static/img/docs/dbt-cloud/change_user_to_read_only_20221023.gif new file mode 100644 index 00000000000..4c4b81c15fa Binary files /dev/null and b/website/static/img/docs/dbt-cloud/change_user_to_read_only_20221023.gif differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/choose-a-connection.png b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/choose-a-connection.png new file mode 100644 index 00000000000..cc8519a8577 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/choose-a-connection.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/confirm-delete.jpg b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/confirm-delete.jpg new file mode 100644 index 00000000000..36ace6f1e07 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/confirm-delete.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/create-deploy-env.jpg b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/create-deploy-env.jpg new file mode 100644 index 00000000000..851ef0b60d6 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/create-deploy-env.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/delete-environment.jpg 
b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/delete-environment.jpg new file mode 100644 index 00000000000..bf38c131a11 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/delete-environment.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/delete-job.jpg b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/delete-job.jpg new file mode 100644 index 00000000000..e1db399a5de Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/delete-job.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/service-token-date.png b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/service-token-date.png new file mode 100644 index 00000000000..8f35eba639b Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-configuring-dbt-cloud/service-token-date.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/change-branch-modal.png b/website/static/img/docs/dbt-cloud/cloud-ide/change-branch-modal.png new file mode 100644 index 00000000000..7ed19020f51 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/change-branch-modal.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/cloud-ide-new.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/cloud-ide-new.jpg new file mode 100644 index 00000000000..38db760e2f2 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/cloud-ide-new.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/cloud-ide-v2.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/cloud-ide-v2.jpg new file mode 100644 index 00000000000..fbebf0c90c6 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/cloud-ide-v2.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/commit-changes-modal.png b/website/static/img/docs/dbt-cloud/cloud-ide/commit-changes-modal.png new file 
mode 100644 index 00000000000..0e2cfacd6af Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/commit-changes-modal.png differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/commit-changes.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/commit-changes.jpg new file mode 100644 index 00000000000..2f3d2cbd143 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/commit-changes.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/commit-resolve.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/commit-resolve.jpg new file mode 100644 index 00000000000..28027788c5b Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/commit-resolve.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/commit-without-resolve.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/commit-without-resolve.jpg new file mode 100644 index 00000000000..2bef28221bd Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/commit-without-resolve.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/compiled-code-console-tab.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/compiled-code-console-tab.jpg new file mode 100644 index 00000000000..4bd298ad80d Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/compiled-code-console-tab.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/editing-components-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/editing-components-with-save.jpg new file mode 100644 index 00000000000..07be8cddb9c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/editing-components-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/editor-tab-menu-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/editor-tab-menu-with-save.jpg new file mode 100644 index 00000000000..baeadf873fc Binary files /dev/null and 
b/website/static/img/docs/dbt-cloud/cloud-ide/editor-tab-menu-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/gitignore-italics.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/gitignore-italics.jpg new file mode 100644 index 00000000000..b1cde629744 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/gitignore-italics.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-basic-layout.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-basic-layout.jpg new file mode 100644 index 00000000000..7392783f2a5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-basic-layout.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-cmd-status.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-cmd-status.jpg new file mode 100644 index 00000000000..cdb77473dfa Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-cmd-status.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-command-bar.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-command-bar.jpg new file mode 100644 index 00000000000..fe60ddd7e03 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-command-bar.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-console-overview.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-console-overview.jpg new file mode 100644 index 00000000000..df2e808fed9 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-console-overview.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-csv.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-csv.jpg new file mode 100644 index 00000000000..c3d53e2edd6 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-csv.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-editing.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-editing.jpg new file mode 
100644 index 00000000000..b2caf0e6acf Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-editing.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-editor-command-palette-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-editor-command-palette-with-save.jpg new file mode 100644 index 00000000000..2c72241aa8c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-editor-command-palette-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-file-search-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-file-search-with-save.jpg new file mode 100644 index 00000000000..f8c03811350 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-file-search-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-git-diff-view-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-git-diff-view-with-save.jpg new file mode 100644 index 00000000000..24eaca5d5c5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-git-diff-view-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-global-command-palette-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-global-command-palette-with-save.jpg new file mode 100644 index 00000000000..49e9e82405c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-global-command-palette-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-inv-history-drawer.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-inv-history-drawer.jpg new file mode 100644 index 00000000000..e55658af182 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-inv-history-drawer.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-lint-format-console.gif b/website/static/img/docs/dbt-cloud/cloud-ide/ide-lint-format-console.gif new file mode 
100644 index 00000000000..0679f702e03 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-lint-format-console.gif differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-markdown-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-markdown-with-save.jpg new file mode 100644 index 00000000000..2514581e959 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-markdown-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-minimap.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-minimap.jpg new file mode 100644 index 00000000000..c4167cf558a Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-minimap.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-options-menu-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-options-menu-with-save.jpg new file mode 100644 index 00000000000..6e26811bd50 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-options-menu-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-results.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-results.jpg new file mode 100644 index 00000000000..27af7192a27 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-results.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-side-menu.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-side-menu.jpg new file mode 100644 index 00000000000..bab04d75a38 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-side-menu.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-sql-popup.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-sql-popup.jpg new file mode 100644 index 00000000000..02729004f9c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-sql-popup.jpg differ diff --git 
a/website/static/img/docs/dbt-cloud/cloud-ide/ide-sqlfluff-config.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-sqlfluff-config.jpg new file mode 100644 index 00000000000..977e36646ba Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-sqlfluff-config.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-status-modal-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-status-modal-with-save.jpg new file mode 100644 index 00000000000..5845d72ca90 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-status-modal-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/ide-unsaved-modal.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/ide-unsaved-modal.jpg new file mode 100644 index 00000000000..bbd1ec58d86 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/ide-unsaved-modal.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/invocation-components-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/invocation-components-with-save.jpg new file mode 100644 index 00000000000..0d740df6c5e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/invocation-components-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/lineage-console-tab.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/lineage-console-tab.jpg new file mode 100644 index 00000000000..7952889b11c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/lineage-console-tab.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/merge-conflict.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/merge-conflict.jpg new file mode 100644 index 00000000000..3cbd1524047 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/merge-conflict.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/prettier.gif b/website/static/img/docs/dbt-cloud/cloud-ide/prettier.gif new 
file mode 100644 index 00000000000..befe65b0ad5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/prettier.gif differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/project-yml-clean.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/project-yml-clean.jpg new file mode 100644 index 00000000000..bdb3dfe757b Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/project-yml-clean.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/project-yml-gitignore.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/project-yml-gitignore.jpg new file mode 100644 index 00000000000..782454ff3ac Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/project-yml-gitignore.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/python-black.gif b/website/static/img/docs/dbt-cloud/cloud-ide/python-black.gif new file mode 100644 index 00000000000..09da6cdfe74 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/python-black.gif differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/resolve-conflict.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/resolve-conflict.jpg new file mode 100644 index 00000000000..fc1bf36fee2 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/resolve-conflict.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/restart-ide.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/restart-ide.jpg new file mode 100644 index 00000000000..084f6b33104 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/restart-ide.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/results-console-tab.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/results-console-tab.jpg new file mode 100644 index 00000000000..30fbd34fd53 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/results-console-tab.jpg differ diff --git 
a/website/static/img/docs/dbt-cloud/cloud-ide/revert-uncommitted-changes-with-save.jpg b/website/static/img/docs/dbt-cloud/cloud-ide/revert-uncommitted-changes-with-save.jpg new file mode 100644 index 00000000000..5297456b2f9 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/revert-uncommitted-changes-with-save.jpg differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/sqlfluff.gif b/website/static/img/docs/dbt-cloud/cloud-ide/sqlfluff.gif new file mode 100644 index 00000000000..d9b7fe98208 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/sqlfluff.gif differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/sqlfmt.gif b/website/static/img/docs/dbt-cloud/cloud-ide/sqlfmt.gif new file mode 100644 index 00000000000..c0cefba0c92 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/sqlfmt.gif differ diff --git a/website/static/img/docs/dbt-cloud/cloud-ide/version-control-options-menu.png b/website/static/img/docs/dbt-cloud/cloud-ide/version-control-options-menu.png new file mode 100644 index 00000000000..927f451f803 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/cloud-ide/version-control-options-menu.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-new-application-alternative-old.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-new-application-alternative-old.png new file mode 100644 index 00000000000..fe66d932f92 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-new-application-alternative-old.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-new-application-alternative.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-new-application-alternative.png index fe66d932f92..5c18ed44972 100644 Binary files a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-new-application-alternative.png 
and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-new-application-alternative.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-redirect-uri.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-redirect-uri.png index 7daaab4504d..3bb04467abd 100644 Binary files a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-redirect-uri.png and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/azure/azure-redirect-uri.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permission-sets-diagram.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permission-sets-diagram.png deleted file mode 100644 index e8a80f29266..00000000000 Binary files a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permission-sets-diagram.png and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/gsuite/gsuite-sso-credentials-old.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/gsuite/gsuite-sso-credentials-old.png new file mode 100644 index 00000000000..303f5ca8d79 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/gsuite/gsuite-sso-credentials-old.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/gsuite/gsuite-sso-credentials.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/gsuite/gsuite-sso-credentials.png index 303f5ca8d79..b0f969fbfe5 100644 Binary files a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/gsuite/gsuite-sso-credentials.png and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/gsuite/gsuite-sso-credentials.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/okta/okta-3-saml-settings-top-old.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/okta/okta-3-saml-settings-top-old.png new file mode 100644 index 00000000000..53876b8af50 Binary 
files /dev/null and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/okta/okta-3-saml-settings-top-old.png differ diff --git a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/okta/okta-3-saml-settings-top.png b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/okta/okta-3-saml-settings-top.png index 53876b8af50..6f44c97413f 100644 Binary files a/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/okta/okta-3-saml-settings-top.png and b/website/static/img/docs/dbt-cloud/dbt-cloud-enterprise/okta/okta-3-saml-settings-top.png differ diff --git a/website/static/img/docs/dbt-cloud/defer-toggle.jpg b/website/static/img/docs/dbt-cloud/defer-toggle.jpg new file mode 100644 index 00000000000..7bd5a1c1283 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/defer-toggle.jpg differ diff --git a/website/static/img/docs/dbt-cloud/delete_projects_from_dbt_cloud_20221023.gif b/website/static/img/docs/dbt-cloud/delete_projects_from_dbt_cloud_20221023.gif new file mode 100644 index 00000000000..246c912c55b Binary files /dev/null and b/website/static/img/docs/dbt-cloud/delete_projects_from_dbt_cloud_20221023.gif differ diff --git a/website/static/img/docs/dbt-cloud/delete_user_20221023.gif b/website/static/img/docs/dbt-cloud/delete_user_20221023.gif new file mode 100644 index 00000000000..07ec7375ee5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/delete_user_20221023.gif differ diff --git a/website/static/img/docs/dbt-cloud/deployment/access-logs.gif b/website/static/img/docs/dbt-cloud/deployment/access-logs.gif new file mode 100644 index 00000000000..f3da267745c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/deployment/access-logs.gif differ diff --git a/website/static/img/docs/dbt-cloud/deployment/deploy-scheduler.jpg b/website/static/img/docs/dbt-cloud/deployment/deploy-scheduler.jpg new file mode 100644 index 00000000000..d10a6942e04 Binary files /dev/null and 
b/website/static/img/docs/dbt-cloud/deployment/deploy-scheduler.jpg differ diff --git a/website/static/img/docs/dbt-cloud/deployment/run-error-message.jpg b/website/static/img/docs/dbt-cloud/deployment/run-error-message.jpg new file mode 100644 index 00000000000..dc9226c93d5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/deployment/run-error-message.jpg differ diff --git a/website/static/img/docs/dbt-cloud/deployment/run-history.jpg b/website/static/img/docs/dbt-cloud/deployment/run-history.jpg new file mode 100644 index 00000000000..9628a887f43 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/deployment/run-history.jpg differ diff --git a/website/static/img/docs/dbt-cloud/deployment/run-overview.jpg b/website/static/img/docs/dbt-cloud/deployment/run-overview.jpg new file mode 100644 index 00000000000..8ab14b8ce2b Binary files /dev/null and b/website/static/img/docs/dbt-cloud/deployment/run-overview.jpg differ diff --git a/website/static/img/docs/dbt-cloud/disconnect-repo.gif b/website/static/img/docs/dbt-cloud/disconnect-repo.gif new file mode 100644 index 00000000000..135ae789fa8 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/disconnect-repo.gif differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/data-freshness-metadata.jpg b/website/static/img/docs/dbt-cloud/discovery-api/data-freshness-metadata.jpg new file mode 100644 index 00000000000..ebe7eba9edf Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/data-freshness-metadata.jpg differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/dbt-dag.jpg b/website/static/img/docs/dbt-cloud/discovery-api/dbt-dag.jpg new file mode 100644 index 00000000000..b5293ea74b2 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/dbt-dag.jpg differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/discovery-api-figure.png b/website/static/img/docs/dbt-cloud/discovery-api/discovery-api-figure.png new 
file mode 100644 index 00000000000..01bed5b0d7c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/discovery-api-figure.png differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/example-dag.png b/website/static/img/docs/dbt-cloud/discovery-api/example-dag.png new file mode 100644 index 00000000000..e9b10b41421 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/example-dag.png differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/graphql.jpg b/website/static/img/docs/dbt-cloud/discovery-api/graphql.jpg new file mode 100644 index 00000000000..dc4d223b29d Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/graphql.jpg differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/graphql_header.jpg b/website/static/img/docs/dbt-cloud/discovery-api/graphql_header.jpg new file mode 100644 index 00000000000..d2f482de893 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/graphql_header.jpg differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/lineage-graph-with-freshness-info.png b/website/static/img/docs/dbt-cloud/discovery-api/lineage-graph-with-freshness-info.png new file mode 100644 index 00000000000..87ce3b7b147 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/lineage-graph-with-freshness-info.png differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/lineage-graph.png b/website/static/img/docs/dbt-cloud/discovery-api/lineage-graph.png new file mode 100644 index 00000000000..ec01ce2fc77 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/lineage-graph.png differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/model-timing.jpg b/website/static/img/docs/dbt-cloud/discovery-api/model-timing.jpg new file mode 100644 index 00000000000..155108aefb7 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/model-timing.jpg differ 
diff --git a/website/static/img/docs/dbt-cloud/discovery-api/plot-of-executiontime.png b/website/static/img/docs/dbt-cloud/discovery-api/plot-of-executiontime.png new file mode 100644 index 00000000000..bd80db05f21 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/plot-of-executiontime.png differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/plot-of-runelapsedtime.png b/website/static/img/docs/dbt-cloud/discovery-api/plot-of-runelapsedtime.png new file mode 100644 index 00000000000..b4a590617fd Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/plot-of-runelapsedtime.png differ diff --git a/website/static/img/docs/dbt-cloud/discovery-api/source-freshness-page.png b/website/static/img/docs/dbt-cloud/discovery-api/source-freshness-page.png new file mode 100644 index 00000000000..411754a847c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/discovery-api/source-freshness-page.png differ diff --git a/website/static/img/docs/dbt-cloud/downgrade-dev-flow.gif b/website/static/img/docs/dbt-cloud/downgrade-dev-flow.gif new file mode 100644 index 00000000000..1164308d2ec Binary files /dev/null and b/website/static/img/docs/dbt-cloud/downgrade-dev-flow.gif differ diff --git a/website/static/img/docs/dbt-cloud/enterprise-upgrade.gif b/website/static/img/docs/dbt-cloud/enterprise-upgrade.gif new file mode 100644 index 00000000000..32dbbc38761 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/enterprise-upgrade.gif differ diff --git a/website/static/img/docs/dbt-cloud/faq-account-settings-billing.jpg b/website/static/img/docs/dbt-cloud/faq-account-settings-billing.jpg new file mode 100644 index 00000000000..44d977de4e3 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/faq-account-settings-billing.jpg differ diff --git a/website/static/img/docs/dbt-cloud/faq-account-settings-enterprise.jpg b/website/static/img/docs/dbt-cloud/faq-account-settings-enterprise.jpg new file mode 
100644 index 00000000000..28d4f9b4338 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/faq-account-settings-enterprise.jpg differ diff --git a/website/static/img/docs/dbt-cloud/faq-account-settings-users.jpg b/website/static/img/docs/dbt-cloud/faq-account-settings-users.jpg new file mode 100644 index 00000000000..8da52272049 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/faq-account-settings-users.jpg differ diff --git a/website/static/img/docs/dbt-cloud/ip-restricted-email.png b/website/static/img/docs/dbt-cloud/ip-restricted-email.png new file mode 100644 index 00000000000..bc03bf61029 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/ip-restricted-email.png differ diff --git a/website/static/img/docs/dbt-cloud/ip-restricted-sso.png b/website/static/img/docs/dbt-cloud/ip-restricted-sso.png new file mode 100644 index 00000000000..3f7b12142d1 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/ip-restricted-sso.png differ diff --git a/website/static/img/docs/dbt-cloud/model-timing.jpg b/website/static/img/docs/dbt-cloud/model-timing.jpg new file mode 100644 index 00000000000..7ae45eee832 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/model-timing.jpg differ diff --git a/website/static/img/docs/dbt-cloud/on-premises/data-flows.png b/website/static/img/docs/dbt-cloud/on-premises/data-flows.png index 1158d7e7a36..efdc8b562f4 100644 Binary files a/website/static/img/docs/dbt-cloud/on-premises/data-flows.png and b/website/static/img/docs/dbt-cloud/on-premises/data-flows.png differ diff --git a/website/static/img/docs/dbt-cloud/on-premises/disconnect-repo.gif b/website/static/img/docs/dbt-cloud/on-premises/disconnect-repo.gif new file mode 100644 index 00000000000..135ae789fa8 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/on-premises/disconnect-repo.gif differ diff --git a/website/static/img/docs/dbt-cloud/on-premises/self-signed-cert.png 
b/website/static/img/docs/dbt-cloud/on-premises/self-signed-cert.png deleted file mode 100644 index 08ea839b002..00000000000 Binary files a/website/static/img/docs/dbt-cloud/on-premises/self-signed-cert.png and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/redshiftprivatelink1.png b/website/static/img/docs/dbt-cloud/redshiftprivatelink1.png new file mode 100644 index 00000000000..092f7361fd4 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/redshiftprivatelink1.png differ diff --git a/website/static/img/docs/dbt-cloud/redshiftprivatelink2.png b/website/static/img/docs/dbt-cloud/redshiftprivatelink2.png new file mode 100644 index 00000000000..4a490599b3a Binary files /dev/null and b/website/static/img/docs/dbt-cloud/redshiftprivatelink2.png differ diff --git a/website/static/img/docs/dbt-cloud/redshiftprivatelink3.png b/website/static/img/docs/dbt-cloud/redshiftprivatelink3.png new file mode 100644 index 00000000000..28b1c9cc6ab Binary files /dev/null and b/website/static/img/docs/dbt-cloud/redshiftprivatelink3.png differ diff --git a/website/static/img/docs/dbt-cloud/redshiftprivatelink4.png b/website/static/img/docs/dbt-cloud/redshiftprivatelink4.png new file mode 100644 index 00000000000..300b9cf4f16 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/redshiftprivatelink4.png differ diff --git a/website/static/img/docs/dbt-cloud/redshiftprivatelink5.png b/website/static/img/docs/dbt-cloud/redshiftprivatelink5.png new file mode 100644 index 00000000000..35adf4d33a1 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/redshiftprivatelink5.png differ diff --git a/website/static/img/docs/dbt-cloud/redshiftprivatelink6.png b/website/static/img/docs/dbt-cloud/redshiftprivatelink6.png new file mode 100644 index 00000000000..ed34140659c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/redshiftprivatelink6.png differ diff --git a/website/static/img/docs/dbt-cloud/redshiftserverless.png 
b/website/static/img/docs/dbt-cloud/redshiftserverless.png new file mode 100644 index 00000000000..57b1dbb3cf7 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/redshiftserverless.png differ diff --git a/website/static/img/docs/dbt-cloud/refresh-ide/building.gif b/website/static/img/docs/dbt-cloud/refresh-ide/building.gif index b521eaa18d5..335399da311 100644 Binary files a/website/static/img/docs/dbt-cloud/refresh-ide/building.gif and b/website/static/img/docs/dbt-cloud/refresh-ide/building.gif differ diff --git a/website/static/img/docs/dbt-cloud/refresh-ide/dev-credentials.jpg b/website/static/img/docs/dbt-cloud/refresh-ide/dev-credentials.jpg new file mode 100644 index 00000000000..cc64f74065c Binary files /dev/null and b/website/static/img/docs/dbt-cloud/refresh-ide/dev-credentials.jpg differ diff --git a/website/static/img/docs/dbt-cloud/rn-run-history.jpg b/website/static/img/docs/dbt-cloud/rn-run-history.jpg new file mode 100644 index 00000000000..443843f643e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/rn-run-history.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/connection-architecture.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/connection-architecture.jpg new file mode 100644 index 00000000000..663529ce4f3 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/connection-architecture.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/entity-lineage.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/entity-lineage.jpg new file mode 100644 index 00000000000..edd32e560a5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/entity-lineage.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/new-sl-configure.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/new-sl-configure.jpg new file mode 100644 index 00000000000..9e624693aa0 Binary files /dev/null and 
b/website/static/img/docs/dbt-cloud/semantic-layer/new-sl-configure.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-architecture.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-architecture.jpg new file mode 100644 index 00000000000..b6801e88bce Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-architecture.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-example.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-example.jpg new file mode 100644 index 00000000000..d73b6167dba Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-example.jpg differ diff --git a/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg new file mode 100644 index 00000000000..41fe17c7654 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/semantic-layer/sl-configure-sl.jpg differ diff --git a/website/static/img/docs/dbt-cloud/snowflakeprivatelink1.png b/website/static/img/docs/dbt-cloud/snowflakeprivatelink1.png new file mode 100644 index 00000000000..2e688c2c018 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/snowflakeprivatelink1.png differ diff --git a/website/static/img/docs/dbt-cloud/trial-team-flow.gif b/website/static/img/docs/dbt-cloud/trial-team-flow.gif new file mode 100644 index 00000000000..290402c7cf7 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/trial-team-flow.gif differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/09c886f-Screen_Shot_2019-02-08_at_4.54.41_PM.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/09c886f-Screen_Shot_2019-02-08_at_4.54.41_PM.png deleted file mode 100644 index 63be778551d..00000000000 Binary files a/website/static/img/docs/dbt-cloud/using-dbt-cloud/09c886f-Screen_Shot_2019-02-08_at_4.54.41_PM.png and /dev/null differ diff --git 
a/website/static/img/docs/dbt-cloud/using-dbt-cloud/61536c9-Screen_Shot_2019-02-08_at_9.46.29_PM.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/61536c9-Screen_Shot_2019-02-08_at_9.46.29_PM.png deleted file mode 100644 index 1fd34b2f29d..00000000000 Binary files a/website/static/img/docs/dbt-cloud/using-dbt-cloud/61536c9-Screen_Shot_2019-02-08_at_9.46.29_PM.png and /dev/null differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-deferral.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-deferral.png index 5186b0a9a76..459c0ec616d 100644 Binary files a/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-deferral.png and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-deferral.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-job-adv-settings.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-job-adv-settings.png new file mode 100644 index 00000000000..1ef43a9588e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-job-adv-settings.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-workflow.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-workflow.png new file mode 100644 index 00000000000..08468d431ef Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/ci-workflow.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png new file mode 100644 index 00000000000..7da23bd1dc9 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-ci-job.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-deploy-job.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-deploy-job.png new file mode 100644 index 00000000000..88b8047fef5 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-deploy-job.png differ diff 
--git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-new-job.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-new-job.jpg new file mode 100644 index 00000000000..4183e16389e Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/create-new-job.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/dashboard-status-tiles/sigma-embed.gif b/website/static/img/docs/dbt-cloud/using-dbt-cloud/dashboard-status-tiles/sigma-embed.gif new file mode 100644 index 00000000000..d368a031bd4 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/dashboard-status-tiles/sigma-embed.gif differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/dbt-cloud-enterprise/DBX-auth/dbt-databricks-oauth-user.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/dbt-cloud-enterprise/DBX-auth/dbt-databricks-oauth-user.png new file mode 100644 index 00000000000..aecf99d726a Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/dbt-cloud-enterprise/DBX-auth/dbt-databricks-oauth-user.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/dbt-cloud-enterprise/DBX-auth/dbt-databricks-oauth.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/dbt-cloud-enterprise/DBX-auth/dbt-databricks-oauth.png new file mode 100644 index 00000000000..bb32fab2afb Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/dbt-cloud-enterprise/DBX-auth/dbt-databricks-oauth.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/deploy-job-adv-settings.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/deploy-job-adv-settings.png new file mode 100644 index 00000000000..8ed834a23bc Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/deploy-job-adv-settings.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-github-pr.png 
b/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-github-pr.png new file mode 100644 index 00000000000..9f649fa0305 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-github-pr.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-smart-cancel-job.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-smart-cancel-job.png new file mode 100644 index 00000000000..10cce4922a9 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/example-smart-cancel-job.png differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/execution-settings.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/execution-settings.jpg new file mode 100644 index 00000000000..dc45a0d45a2 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/execution-settings.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/extended-attributes.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/extended-attributes.jpg new file mode 100644 index 00000000000..3b5929c3141 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/extended-attributes.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/fail-dbtdeps.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/fail-dbtdeps.jpg new file mode 100644 index 00000000000..731f18e8d68 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/fail-dbtdeps.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/job-commands.gif b/website/static/img/docs/dbt-cloud/using-dbt-cloud/job-commands.gif new file mode 100644 index 00000000000..6f579e96125 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/job-commands.gif differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.jpg new file mode 
100644 index 00000000000..6eb689b0cdd Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/prod-settings.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/skipped-jobs.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/skipped-jobs.jpg new file mode 100644 index 00000000000..56e3c1fdae3 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/skipped-jobs.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/triggers.jpg b/website/static/img/docs/dbt-cloud/using-dbt-cloud/triggers.jpg new file mode 100644 index 00000000000..eff48b00611 Binary files /dev/null and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/triggers.jpg differ diff --git a/website/static/img/docs/dbt-cloud/using-dbt-cloud/using_ci_dbt_cloud.png b/website/static/img/docs/dbt-cloud/using-dbt-cloud/using_ci_dbt_cloud.png index 63fa5a1450d..63cf7d96341 100644 Binary files a/website/static/img/docs/dbt-cloud/using-dbt-cloud/using_ci_dbt_cloud.png and b/website/static/img/docs/dbt-cloud/using-dbt-cloud/using_ci_dbt_cloud.png differ diff --git a/website/static/img/docs/dbt-versions/experimental-feats.png b/website/static/img/docs/dbt-versions/experimental-feats.png new file mode 100644 index 00000000000..f4c353b8bb4 Binary files /dev/null and b/website/static/img/docs/dbt-versions/experimental-feats.png differ diff --git a/website/static/img/docs/deploy/native-retry.gif b/website/static/img/docs/deploy/native-retry.gif new file mode 100644 index 00000000000..020a9958fc5 Binary files /dev/null and b/website/static/img/docs/deploy/native-retry.gif differ diff --git a/website/static/img/docs/release-notes/ci-checks.png b/website/static/img/docs/release-notes/ci-checks.png new file mode 100644 index 00000000000..64750857fce Binary files /dev/null and b/website/static/img/docs/release-notes/ci-checks.png differ diff --git a/website/static/img/docs/release-notes/ci-job-setup.gif 
b/website/static/img/docs/release-notes/ci-job-setup.gif new file mode 100644 index 00000000000..56beea4ab88 Binary files /dev/null and b/website/static/img/docs/release-notes/ci-job-setup.gif differ diff --git a/website/static/img/docs/release-notes/ci-job-tag.png b/website/static/img/docs/release-notes/ci-job-tag.png new file mode 100644 index 00000000000..02f2cdc895c Binary files /dev/null and b/website/static/img/docs/release-notes/ci-job-tag.png differ diff --git a/website/static/img/docs/release-notes/dbt-cloud-versions.png b/website/static/img/docs/release-notes/dbt-cloud-versions.png new file mode 100644 index 00000000000..26c9f5fa0a2 Binary files /dev/null and b/website/static/img/docs/release-notes/dbt-cloud-versions.png differ diff --git a/website/static/img/docs/release-notes/new-jobs-default-as-off.png b/website/static/img/docs/release-notes/new-jobs-default-as-off.png new file mode 100644 index 00000000000..2f3bb6fc78f Binary files /dev/null and b/website/static/img/docs/release-notes/new-jobs-default-as-off.png differ diff --git a/website/static/img/docs/release-notes/run-details-and-logs-improvements.gif b/website/static/img/docs/release-notes/run-details-and-logs-improvements.gif new file mode 100644 index 00000000000..f3da267745c Binary files /dev/null and b/website/static/img/docs/release-notes/run-details-and-logs-improvements.gif differ diff --git a/website/static/img/docs/release-notes/run-history-improvements.gif b/website/static/img/docs/release-notes/run-history-improvements.gif new file mode 100644 index 00000000000..29ef5c729f7 Binary files /dev/null and b/website/static/img/docs/release-notes/run-history-improvements.gif differ diff --git a/website/static/img/docs/running-a-dbt-project/dbt_cloud_job_prefect.jpg b/website/static/img/docs/running-a-dbt-project/dbt_cloud_job_prefect.jpg new file mode 100644 index 00000000000..8eb4d8656d9 Binary files /dev/null and b/website/static/img/docs/running-a-dbt-project/dbt_cloud_job_prefect.jpg 
differ diff --git a/website/static/img/docs/running-a-dbt-project/prefect_dag_dbt_cloud.jpg b/website/static/img/docs/running-a-dbt-project/prefect_dag_dbt_cloud.jpg new file mode 100644 index 00000000000..6fed0a58a77 Binary files /dev/null and b/website/static/img/docs/running-a-dbt-project/prefect_dag_dbt_cloud.jpg differ diff --git a/website/static/img/docs/terms/data-lineage/dag_example.jpg b/website/static/img/docs/terms/data-lineage/dag_example.jpg new file mode 100644 index 00000000000..3d1e4153590 Binary files /dev/null and b/website/static/img/docs/terms/data-lineage/dag_example.jpg differ diff --git a/website/static/img/file-icon.svg b/website/static/img/file-icon.svg new file mode 100644 index 00000000000..91b456dbd0f --- /dev/null +++ b/website/static/img/file-icon.svg @@ -0,0 +1,3 @@ + + + diff --git a/website/static/img/folder-open.svg b/website/static/img/folder-open.svg new file mode 100644 index 00000000000..b08a3331716 --- /dev/null +++ b/website/static/img/folder-open.svg @@ -0,0 +1,3 @@ + + + diff --git a/website/static/img/folder.svg b/website/static/img/folder.svg new file mode 100644 index 00000000000..bc1e18919b5 --- /dev/null +++ b/website/static/img/folder.svg @@ -0,0 +1,3 @@ + + + diff --git a/website/static/img/guides/best-practices/environment-setup/many-branch-git.png b/website/static/img/guides/best-practices/environment-setup/many-branch-git.png new file mode 100644 index 00000000000..9c8077fa450 Binary files /dev/null and b/website/static/img/guides/best-practices/environment-setup/many-branch-git.png differ diff --git a/website/static/img/guides/best-practices/environment-setup/many-deployments-table.png b/website/static/img/guides/best-practices/environment-setup/many-deployments-table.png new file mode 100644 index 00000000000..db4c0a5ad47 Binary files /dev/null and b/website/static/img/guides/best-practices/environment-setup/many-deployments-table.png differ diff --git 
a/website/static/img/guides/best-practices/environment-setup/one-branch-git.png b/website/static/img/guides/best-practices/environment-setup/one-branch-git.png new file mode 100644 index 00000000000..51850114631 Binary files /dev/null and b/website/static/img/guides/best-practices/environment-setup/one-branch-git.png differ diff --git a/website/static/img/guides/best-practices/environment-setup/one-deployment-table.png b/website/static/img/guides/best-practices/environment-setup/one-deployment-table.png new file mode 100644 index 00000000000..7add8affcf7 Binary files /dev/null and b/website/static/img/guides/best-practices/environment-setup/one-deployment-table.png differ diff --git a/website/static/img/guides/best-practices/materializations/dbt-build-output.png b/website/static/img/guides/best-practices/materializations/dbt-build-output.png new file mode 100644 index 00000000000..a6d56c12315 Binary files /dev/null and b/website/static/img/guides/best-practices/materializations/dbt-build-output.png differ diff --git a/website/static/img/guides/best-practices/materializations/incremental-diagram.png b/website/static/img/guides/best-practices/materializations/incremental-diagram.png new file mode 100644 index 00000000000..44394e58408 Binary files /dev/null and b/website/static/img/guides/best-practices/materializations/incremental-diagram.png differ diff --git a/website/static/img/guides/best-practices/materializations/model-timing-diagram.png b/website/static/img/guides/best-practices/materializations/model-timing-diagram.png new file mode 100644 index 00000000000..6b0f6e5b61b Binary files /dev/null and b/website/static/img/guides/best-practices/materializations/model-timing-diagram.png differ diff --git a/website/static/img/guides/best-practices/materializations/snowflake-query-timing.png b/website/static/img/guides/best-practices/materializations/snowflake-query-timing.png new file mode 100644 index 00000000000..4597aea7efb Binary files /dev/null and 
b/website/static/img/guides/best-practices/materializations/snowflake-query-timing.png differ diff --git a/website/static/img/guides/best-practices/materializations/tables-and-views.png b/website/static/img/guides/best-practices/materializations/tables-and-views.png new file mode 100644 index 00000000000..b106b1b286c Binary files /dev/null and b/website/static/img/guides/best-practices/materializations/tables-and-views.png differ diff --git a/website/static/img/guides/best-practices/semantic-layer/orders_erd.png b/website/static/img/guides/best-practices/semantic-layer/orders_erd.png new file mode 100644 index 00000000000..56e35256d83 Binary files /dev/null and b/website/static/img/guides/best-practices/semantic-layer/orders_erd.png differ diff --git a/website/static/img/guides/databricks-guides/databricks-connection-env-vars.png b/website/static/img/guides/databricks-guides/databricks-connection-env-vars.png new file mode 100644 index 00000000000..03b40e6a34f Binary files /dev/null and b/website/static/img/guides/databricks-guides/databricks-connection-env-vars.png differ diff --git a/website/static/img/guides/databricks-guides/databricks-env-variables.png b/website/static/img/guides/databricks-guides/databricks-env-variables.png new file mode 100644 index 00000000000..95554989970 Binary files /dev/null and b/website/static/img/guides/databricks-guides/databricks-env-variables.png differ diff --git a/website/static/img/guides/databricks-guides/deployment_monitor_dbx.png b/website/static/img/guides/databricks-guides/deployment_monitor_dbx.png new file mode 100644 index 00000000000..cd3fab70cfb Binary files /dev/null and b/website/static/img/guides/databricks-guides/deployment_monitor_dbx.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/1-python-model-details-output.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/1-python-model-details-output.png new file mode 
100644 index 00000000000..256a1772c29 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/1-python-model-details-output.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/2-fastest-pit-stops-preview.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/2-fastest-pit-stops-preview.png new file mode 100644 index 00000000000..fb3eb9a59d0 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/2-fastest-pit-stops-preview.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/3-lap-times-trends-preview.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/3-lap-times-trends-preview.png new file mode 100644 index 00000000000..644ceefe5fa Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/10-python-transformations/3-lap-times-trends-preview.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep/1-completed-ml-data-prep.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep/1-completed-ml-data-prep.png new file mode 100644 index 00000000000..97f306277de Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/11-machine-learning-prep/1-completed-ml-data-prep.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/1-preview-train-test-position.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/1-preview-train-test-position.png new file mode 100644 index 00000000000..5e290483bc3 Binary files /dev/null and 
b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/1-preview-train-test-position.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/2-list-snowflake-stage.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/2-list-snowflake-stage.png new file mode 100644 index 00000000000..03d1c057aa1 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/2-list-snowflake-stage.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/3-view-snowflake-query-history.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/3-view-snowflake-query-history.png new file mode 100644 index 00000000000..45242db2301 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/12-machine-learning-training-prediction/3-view-snowflake-query-history.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/1-generic-testing-file-tree.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/1-generic-testing-file-tree.png new file mode 100644 index 00000000000..471cb7828fb Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/1-generic-testing-file-tree.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/2-macro-testing.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/2-macro-testing.png new file mode 100644 index 00000000000..64a5a8fb532 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/2-macro-testing.png differ diff --git 
a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/3-gte-macro-applied-to-pit-stops.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/3-gte-macro-applied-to-pit-stops.png new file mode 100644 index 00000000000..7af19986312 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/3-gte-macro-applied-to-pit-stops.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/4-custom-singular-test.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/4-custom-singular-test.png new file mode 100644 index 00000000000..79645ab54fd Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/4-custom-singular-test.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/5-running-tests-on-python-models.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/5-running-tests-on-python-models.png new file mode 100644 index 00000000000..ea595861475 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/5-running-tests-on-python-models.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/6-testing-output-details.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/6-testing-output-details.png new file mode 100644 index 00000000000..fd6dfd2637b Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/13-testing/6-testing-output-details.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/1-docs-icon.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/1-docs-icon.png new file mode 100644 index 00000000000..df55a28b845 Binary files /dev/null and 
b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/1-docs-icon.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/2-view-docblock-description.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/2-view-docblock-description.png new file mode 100644 index 00000000000..91d3fe1cdfb Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/2-view-docblock-description.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/3-mini-lineage-docs.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/3-mini-lineage-docs.png new file mode 100644 index 00000000000..45e87467e13 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/3-mini-lineage-docs.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/4-full-dag-docs.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/4-full-dag-docs.png new file mode 100644 index 00000000000..80a5fcc035e Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/14-documentation/4-full-dag-docs.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/1-merge-to-main-branch.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/1-merge-to-main-branch.png new file mode 100644 index 00000000000..7575726c4c0 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/1-merge-to-main-branch.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/2-ui-select-environments.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/2-ui-select-environments.png new file mode 100644 index 
00000000000..8b5949b9645 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/2-ui-select-environments.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/3-update-deployment-credentials-production.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/3-update-deployment-credentials-production.png new file mode 100644 index 00000000000..44eefc1d4b9 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/3-update-deployment-credentials-production.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/4-run-production-job.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/4-run-production-job.png new file mode 100644 index 00000000000..1faf973f790 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/4-run-production-job.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/5-job-details.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/5-job-details.png new file mode 100644 index 00000000000..8cc7cc04ed7 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/5-job-details.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/6-all-models-generated.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/6-all-models-generated.png new file mode 100644 index 00000000000..9b3a349b905 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/15-deployment/6-all-models-generated.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/1-snowflake-trial-AWS-setup.png 
b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/1-snowflake-trial-AWS-setup.png new file mode 100644 index 00000000000..d26da4c4e2c Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/1-snowflake-trial-AWS-setup.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/2-new-snowflake-account.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/2-new-snowflake-account.png new file mode 100644 index 00000000000..0d62bd4766c Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/2-new-snowflake-account.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/3-accept-anaconda-terms.jpeg b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/3-accept-anaconda-terms.jpeg new file mode 100644 index 00000000000..9c759b0d78a Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/3-accept-anaconda-terms.jpeg differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/4-enable-anaconda.jpeg b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/4-enable-anaconda.jpeg new file mode 100644 index 00000000000..543b6686619 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/2-snowflake-configuration/4-enable-anaconda.jpeg differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/1-rename-worksheet-and-select-warehouse.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/1-rename-worksheet-and-select-warehouse.png new file mode 100644 index 00000000000..099ccf0fcc5 Binary files /dev/null 
and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/1-rename-worksheet-and-select-warehouse.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/2-load-data-from-s3.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/2-load-data-from-s3.png new file mode 100644 index 00000000000..72b3e37d78f Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/2-load-data-from-s3.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/3-create-new-worksheet-to-query-data.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/3-create-new-worksheet-to-query-data.png new file mode 100644 index 00000000000..e1175edf958 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/3-create-new-worksheet-to-query-data.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/4-query-circuits-data.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/4-query-circuits-data.png new file mode 100644 index 00000000000..f14b18fa42e Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/3-connect-to-data-source/4-query-circuits-data.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/1-open-partner-connect.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/1-open-partner-connect.png new file mode 100644 index 00000000000..0a121ea70f2 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/1-open-partner-connect.png differ diff --git 
a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/2-partner-connect-optional-grant.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/2-partner-connect-optional-grant.png new file mode 100644 index 00000000000..bbfd1febd9c Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/2-partner-connect-optional-grant.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/3-connect-to-dbt.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/3-connect-to-dbt.png new file mode 100644 index 00000000000..7a64be032cd Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/3-connect-to-dbt.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/4-dbt-cloud-sign-up.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/4-dbt-cloud-sign-up.png new file mode 100644 index 00000000000..ae3f389a378 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/4-configure-dbt/4-dbt-cloud-sign-up.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/1-settings-gear-icon.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/1-settings-gear-icon.png new file mode 100644 index 00000000000..d27aef5dc9d Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/1-settings-gear-icon.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/10-confirm-example-models-built-in-snowflake.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/10-confirm-example-models-built-in-snowflake.png new file mode 100644 index 
00000000000..c14a9b945b3 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/10-confirm-example-models-built-in-snowflake.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/2-credentials-edit-schema-name.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/2-credentials-edit-schema-name.png new file mode 100644 index 00000000000..72d977f2ce1 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/2-credentials-edit-schema-name.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/3-save-new-schema-name.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/3-save-new-schema-name.png new file mode 100644 index 00000000000..a82938aa094 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/3-save-new-schema-name.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/4-initialize-dbt-project.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/4-initialize-dbt-project.png new file mode 100644 index 00000000000..19822a43255 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/4-initialize-dbt-project.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/5-first-commit-and-push.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/5-first-commit-and-push.png new file mode 100644 index 00000000000..334102a14b6 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/5-first-commit-and-push.png differ 
diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/6-initalize-project.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/6-initalize-project.png new file mode 100644 index 00000000000..330367d3344 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/6-initalize-project.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/7-IDE-overview.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/7-IDE-overview.png new file mode 100644 index 00000000000..b0f053f71dd Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/7-IDE-overview.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/8-dbt-run-example-models.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/8-dbt-run-example-models.png new file mode 100644 index 00000000000..892f46d6d8d Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/8-dbt-run-example-models.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/9-second-model-details.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/9-second-model-details.png new file mode 100644 index 00000000000..41c56b119a5 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/5-development-schema-name/9-second-model-details.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/1-create-folder.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/1-create-folder.png new file mode 100644 index 
00000000000..c7768cee356 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/1-create-folder.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/2-file-path.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/2-file-path.png new file mode 100644 index 00000000000..56b92946a47 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/2-file-path.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/3-tree-of-new-folders.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/3-tree-of-new-folders.png new file mode 100644 index 00000000000..1ff801833ec Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/7-folder-structure/3-tree-of-new-folders.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/1-staging-folder.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/1-staging-folder.png new file mode 100644 index 00000000000..3c7920b5e04 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/1-staging-folder.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/2-delete-example.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/2-delete-example.png new file mode 100644 index 00000000000..381ef2313c2 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/2-delete-example.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/3-successful-run-in-snowflake.png 
b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/3-successful-run-in-snowflake.png new file mode 100644 index 00000000000..be7d1c851d9 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/3-successful-run-in-snowflake.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/4-confirm-models.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/4-confirm-models.png new file mode 100644 index 00000000000..dec23673990 Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/8-sources-and-staging/4-confirm-models.png differ diff --git a/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations/1-dag.png b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations/1-dag.png new file mode 100644 index 00000000000..d22ad0076cc Binary files /dev/null and b/website/static/img/guides/dbt-ecosystem/dbt-python-snowpark/9-sql-transformations/1-dag.png differ diff --git a/website/static/img/guides/migration/versions/surrogate_key_behaviour.png b/website/static/img/guides/migration/versions/surrogate_key_behaviour.png new file mode 100644 index 00000000000..7313d03dd0d Binary files /dev/null and b/website/static/img/guides/migration/versions/surrogate_key_behaviour.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/bitbucket-dbt-cloud-pr.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/bitbucket-dbt-cloud-pr.png new file mode 100644 index 00000000000..2c01ad18d06 Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/bitbucket-dbt-cloud-pr.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/bitbucket-run-on-pr.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/bitbucket-run-on-pr.png new 
file mode 100644 index 00000000000..456a146756d Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/bitbucket-run-on-pr.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-azure.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-azure.png new file mode 100644 index 00000000000..21147571d1f Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-azure.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-bitbucket.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-bitbucket.png new file mode 100644 index 00000000000..b8e9c9befb5 Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-api-key-bitbucket.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-azure-triggered.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-azure-triggered.png new file mode 100644 index 00000000000..930f066bae6 Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-azure-triggered.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-bitbucket-triggered.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-bitbucket-triggered.png new file mode 100644 index 00000000000..67656d5ca52 Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-cloud-job-bitbucket-triggered.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png new file mode 100644 index 00000000000..5a971b19b9f Binary files /dev/null and 
b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-macro-cleanup-pr.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-azure.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-azure.png new file mode 100644 index 00000000000..d0f65f29f2e Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-azure.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-bitbucket.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-bitbucket.png new file mode 100644 index 00000000000..ccc20a894f7 Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/dbt-run-on-merge-bitbucket.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-bitbucket.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-bitbucket.png new file mode 100644 index 00000000000..d50bbbb9183 Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-bitbucket.png differ diff --git a/website/static/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-bitbucket.png b/website/static/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-bitbucket.png new file mode 100644 index 00000000000..b31a9f706d0 Binary files /dev/null and b/website/static/img/guides/orchestration/custom-cicd-pipelines/lint-on-push-logs-bitbucket.png differ diff --git a/website/static/img/guides/orchestration/webhooks/serverless-pagerduty/pagerduty-example-alarm.png b/website/static/img/guides/orchestration/webhooks/serverless-pagerduty/pagerduty-example-alarm.png new file mode 100644 index 00000000000..62f84ce2fca Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/serverless-pagerduty/pagerduty-example-alarm.png differ 
diff --git a/website/static/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png b/website/static/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png new file mode 100644 index 00000000000..84805495608 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-common/catch-raw-hook.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-common/run-python.png b/website/static/img/guides/orchestration/webhooks/zapier-common/run-python.png new file mode 100644 index 00000000000..57c4e2959b0 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-common/run-python.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-ui.png b/website/static/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-ui.png new file mode 100644 index 00000000000..b72b84f82e4 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-ui.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-zap-config.png b/website/static/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-zap-config.png new file mode 100644 index 00000000000..2e1a07183b8 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-ms-teams/ms-teams-zap-config.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-slack/code-example-alternate.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/code-example-alternate.png new file mode 100644 index 00000000000..082de67f6a9 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/code-example-alternate.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-slack/extract-number.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/extract-number.png new file mode 100644 index 
00000000000..563b24c46e5 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/extract-number.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-slack/filter-config.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/filter-config.png new file mode 100644 index 00000000000..f583079bd10 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/filter-config.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-slack/message-trigger-config.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/message-trigger-config.png new file mode 100644 index 00000000000..e1de8e8393b Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/message-trigger-config.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-slack/message-trigger-filter.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/message-trigger-filter.png new file mode 100644 index 00000000000..6dd658997fd Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/message-trigger-filter.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-slack/parent-slack-config.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/parent-slack-config.png new file mode 100644 index 00000000000..8b189af5070 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/parent-slack-config.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-slack/slack-thread-example.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/slack-thread-example.png new file mode 100644 index 00000000000..b8a5256bdb2 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/slack-thread-example.png differ diff --git 
a/website/static/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config-alternate.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config-alternate.png new file mode 100644 index 00000000000..53876f7bfe8 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config-alternate.png differ diff --git a/website/static/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config.png b/website/static/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config.png new file mode 100644 index 00000000000..7b9b13c1cd0 Binary files /dev/null and b/website/static/img/guides/orchestration/webhooks/zapier-slack/thread-slack-config.png differ diff --git a/website/static/img/icons/alloydb.svg b/website/static/img/icons/alloydb.svg new file mode 100644 index 00000000000..acf06768ee6 --- /dev/null +++ b/website/static/img/icons/alloydb.svg @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/website/static/img/icons/athena.svg b/website/static/img/icons/athena.svg new file mode 100644 index 00000000000..c2c6a81dd64 --- /dev/null +++ b/website/static/img/icons/athena.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/bigquery.svg b/website/static/img/icons/bigquery.svg new file mode 100644 index 00000000000..7900dffc1b6 --- /dev/null +++ b/website/static/img/icons/bigquery.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/website/static/img/icons/command-line.svg b/website/static/img/icons/command-line.svg new file mode 100644 index 00000000000..ac653471ee4 --- /dev/null +++ b/website/static/img/icons/command-line.svg @@ -0,0 +1 @@ + diff --git a/website/static/img/icons/databricks.svg b/website/static/img/icons/databricks.svg new file mode 100644 index 00000000000..5417cae65a8 --- /dev/null +++ b/website/static/img/icons/databricks.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/website/static/img/icons/dbt-bit.svg 
b/website/static/img/icons/dbt-bit.svg new file mode 100644 index 00000000000..6465b256272 --- /dev/null +++ b/website/static/img/icons/dbt-bit.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/dbt-core.svg b/website/static/img/icons/dbt-core.svg new file mode 100644 index 00000000000..f94f111c758 --- /dev/null +++ b/website/static/img/icons/dbt-core.svg @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/website/static/img/icons/dremio.svg b/website/static/img/icons/dremio.svg new file mode 100644 index 00000000000..9d6ad9eac25 --- /dev/null +++ b/website/static/img/icons/dremio.svg @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/static/img/icons/github-codespace.svg b/website/static/img/icons/github-codespace.svg new file mode 100644 index 00000000000..0aa83121041 --- /dev/null +++ b/website/static/img/icons/github-codespace.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/website/static/img/icons/github.svg b/website/static/img/icons/github.svg new file mode 100644 index 00000000000..cea00178bf6 --- /dev/null +++ b/website/static/img/icons/github.svg @@ -0,0 +1 @@ + diff --git a/website/static/img/icons/google-sheets-logo-icon.svg b/website/static/img/icons/google-sheets-logo-icon.svg new file mode 100644 index 00000000000..d080c1dd53d --- /dev/null +++ b/website/static/img/icons/google-sheets-logo-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/hex.svg b/website/static/img/icons/hex.svg new file mode 100755 index 00000000000..00431ffe299 --- /dev/null +++ b/website/static/img/icons/hex.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/website/static/img/icons/lightdash.svg b/website/static/img/icons/lightdash.svg new file mode 100644 index 00000000000..96f4676e7ee --- /dev/null +++ b/website/static/img/icons/lightdash.svg @@ -0,0 +1,96 @@ + + + + + + + + \ No newline at end of file diff --git 
a/website/static/img/icons/materialize.svg b/website/static/img/icons/materialize.svg new file mode 100644 index 00000000000..92f693cd94f --- /dev/null +++ b/website/static/img/icons/materialize.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + diff --git a/website/static/img/icons/mode.svg b/website/static/img/icons/mode.svg new file mode 100644 index 00000000000..269c182cd8b --- /dev/null +++ b/website/static/img/icons/mode.svg @@ -0,0 +1,165 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/website/static/img/icons/oracle.svg b/website/static/img/icons/oracle.svg new file mode 100644 index 00000000000..6868dea2eb3 --- /dev/null +++ b/website/static/img/icons/oracle.svg @@ -0,0 +1,47 @@ + + + + + \ No newline at end of file diff --git a/website/static/img/icons/redshift.svg b/website/static/img/icons/redshift.svg new file mode 100644 index 00000000000..e57888813e3 --- /dev/null +++ b/website/static/img/icons/redshift.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/snowflake.svg b/website/static/img/icons/snowflake.svg new file mode 100644 index 00000000000..618bbdc597b --- /dev/null +++ b/website/static/img/icons/snowflake.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/website/static/img/icons/starburst-partner-logo.svg b/website/static/img/icons/starburst-partner-logo.svg new file mode 100644 index 00000000000..979307d9b60 --- /dev/null +++ b/website/static/img/icons/starburst-partner-logo.svg @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + diff --git a/website/static/img/icons/starburst.svg b/website/static/img/icons/starburst.svg new file mode 100644 index 00000000000..dbbe5a252ca --- /dev/null +++ b/website/static/img/icons/starburst.svg @@ -0,0 +1,4 @@ + + + + diff --git a/website/static/img/icons/tableau-software.svg b/website/static/img/icons/tableau-software.svg new file mode 100644 index 00000000000..28996f1dadd --- /dev/null +++ b/website/static/img/icons/tableau-software.svg @@ -0,0 +1 @@ + \ 
No newline at end of file diff --git a/website/static/img/icons/teradata.svg b/website/static/img/icons/teradata.svg new file mode 100644 index 00000000000..cbbfab92d66 --- /dev/null +++ b/website/static/img/icons/teradata.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/white/alloydb.svg b/website/static/img/icons/white/alloydb.svg new file mode 100644 index 00000000000..acf06768ee6 --- /dev/null +++ b/website/static/img/icons/white/alloydb.svg @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/website/static/img/icons/white/athena.svg b/website/static/img/icons/white/athena.svg new file mode 100644 index 00000000000..c2c6a81dd64 --- /dev/null +++ b/website/static/img/icons/white/athena.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/white/bigquery.svg b/website/static/img/icons/white/bigquery.svg new file mode 100644 index 00000000000..7900dffc1b6 --- /dev/null +++ b/website/static/img/icons/white/bigquery.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/website/static/img/icons/white/command-line.svg b/website/static/img/icons/white/command-line.svg new file mode 100644 index 00000000000..37911da7cce --- /dev/null +++ b/website/static/img/icons/white/command-line.svg @@ -0,0 +1 @@ + diff --git a/website/static/img/icons/white/databricks.svg b/website/static/img/icons/white/databricks.svg new file mode 100644 index 00000000000..5417cae65a8 --- /dev/null +++ b/website/static/img/icons/white/databricks.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/website/static/img/icons/white/dbt-bit.svg b/website/static/img/icons/white/dbt-bit.svg new file mode 100644 index 00000000000..6465b256272 --- /dev/null +++ b/website/static/img/icons/white/dbt-bit.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/white/dbt-core.svg b/website/static/img/icons/white/dbt-core.svg new file mode 100644 index 00000000000..658b3664014 --- /dev/null +++ 
b/website/static/img/icons/white/dbt-core.svg @@ -0,0 +1,19 @@ + + + + + + + + + + + + + + + + + + + diff --git a/website/static/img/icons/white/dremio.svg b/website/static/img/icons/white/dremio.svg new file mode 100644 index 00000000000..9d6ad9eac25 --- /dev/null +++ b/website/static/img/icons/white/dremio.svg @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/static/img/icons/white/github-codespace.svg b/website/static/img/icons/white/github-codespace.svg new file mode 100644 index 00000000000..7e0de497091 --- /dev/null +++ b/website/static/img/icons/white/github-codespace.svg @@ -0,0 +1,4 @@ + + + + diff --git a/website/static/img/icons/white/github.svg b/website/static/img/icons/white/github.svg new file mode 100644 index 00000000000..b83e0eb1584 --- /dev/null +++ b/website/static/img/icons/white/github.svg @@ -0,0 +1 @@ + diff --git a/website/static/img/icons/white/google-sheets-logo-icon.svg b/website/static/img/icons/white/google-sheets-logo-icon.svg new file mode 100644 index 00000000000..d080c1dd53d --- /dev/null +++ b/website/static/img/icons/white/google-sheets-logo-icon.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/white/hex.svg b/website/static/img/icons/white/hex.svg new file mode 100644 index 00000000000..00431ffe299 --- /dev/null +++ b/website/static/img/icons/white/hex.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/website/static/img/icons/white/lightdash.svg b/website/static/img/icons/white/lightdash.svg new file mode 100644 index 00000000000..96f4676e7ee --- /dev/null +++ b/website/static/img/icons/white/lightdash.svg @@ -0,0 +1,96 @@ + + + + + + + + \ No newline at end of file diff --git a/website/static/img/icons/white/materialize.svg b/website/static/img/icons/white/materialize.svg new file mode 100644 index 00000000000..92f693cd94f --- /dev/null +++ b/website/static/img/icons/white/materialize.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + diff --git 
a/website/static/img/icons/white/mode.svg b/website/static/img/icons/white/mode.svg new file mode 100644 index 00000000000..269c182cd8b --- /dev/null +++ b/website/static/img/icons/white/mode.svg @@ -0,0 +1,165 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/website/static/img/icons/white/oracle.svg b/website/static/img/icons/white/oracle.svg new file mode 100644 index 00000000000..6868dea2eb3 --- /dev/null +++ b/website/static/img/icons/white/oracle.svg @@ -0,0 +1,47 @@ + + + + + \ No newline at end of file diff --git a/website/static/img/icons/white/redshift.svg b/website/static/img/icons/white/redshift.svg new file mode 100644 index 00000000000..e57888813e3 --- /dev/null +++ b/website/static/img/icons/white/redshift.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/white/snowflake.svg b/website/static/img/icons/white/snowflake.svg new file mode 100644 index 00000000000..618bbdc597b --- /dev/null +++ b/website/static/img/icons/white/snowflake.svg @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/website/static/img/icons/white/starburst-partner-logo.svg b/website/static/img/icons/white/starburst-partner-logo.svg new file mode 100644 index 00000000000..979307d9b60 --- /dev/null +++ b/website/static/img/icons/white/starburst-partner-logo.svg @@ -0,0 +1,30 @@ + + + + + + + + + + + + + + + + diff --git a/website/static/img/icons/white/starburst.svg b/website/static/img/icons/white/starburst.svg new file mode 100644 index 00000000000..dbbe5a252ca --- /dev/null +++ b/website/static/img/icons/white/starburst.svg @@ -0,0 +1,4 @@ + + + + diff --git a/website/static/img/icons/white/tableau-software.svg b/website/static/img/icons/white/tableau-software.svg new file mode 100644 index 00000000000..28996f1dadd --- /dev/null +++ b/website/static/img/icons/white/tableau-software.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/icons/white/teradata.svg b/website/static/img/icons/white/teradata.svg 
new file mode 100644 index 00000000000..cbbfab92d66 --- /dev/null +++ b/website/static/img/icons/white/teradata.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/website/static/img/info-icon.svg b/website/static/img/info-icon.svg new file mode 100644 index 00000000000..b8cca1fa8d0 --- /dev/null +++ b/website/static/img/info-icon.svg @@ -0,0 +1,3 @@ + + + diff --git a/website/static/img/node_color_example.png b/website/static/img/node_color_example.png index 83b26f5735a..a1a62742ca0 100644 Binary files a/website/static/img/node_color_example.png and b/website/static/img/node_color_example.png differ diff --git a/website/static/img/prep-start.jpg b/website/static/img/prep-start.jpg new file mode 100644 index 00000000000..6e3680354b8 Binary files /dev/null and b/website/static/img/prep-start.jpg differ diff --git a/website/static/img/quickstarts/dbt-cloud/add-privilege.png b/website/static/img/quickstarts/dbt-cloud/add-privilege.png new file mode 100644 index 00000000000..0dc16ddf442 Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/add-privilege.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/starburst-data-hierarchy.png b/website/static/img/quickstarts/dbt-cloud/starburst-data-hierarchy.png new file mode 100644 index 00000000000..ca93721e13d Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/starburst-data-hierarchy.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/starburst-galaxy-config-s3.png b/website/static/img/quickstarts/dbt-cloud/starburst-galaxy-config-s3.png new file mode 100644 index 00000000000..197eb500744 Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/starburst-galaxy-config-s3.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/starburst-galaxy-editor.png b/website/static/img/quickstarts/dbt-cloud/starburst-galaxy-editor.png new file mode 100644 index 00000000000..45ec711ff5c Binary files /dev/null and 
b/website/static/img/quickstarts/dbt-cloud/starburst-galaxy-editor.png differ diff --git a/website/static/img/quickstarts/dbt-cloud/test-connection-success.png b/website/static/img/quickstarts/dbt-cloud/test-connection-success.png new file mode 100644 index 00000000000..bb33137666b Binary files /dev/null and b/website/static/img/quickstarts/dbt-cloud/test-connection-success.png differ diff --git a/website/static/img/reference/dremio-setup/dbt-Samples.png b/website/static/img/reference/dremio-setup/dbt-Samples.png new file mode 100644 index 00000000000..41ec3c9bbb5 Binary files /dev/null and b/website/static/img/reference/dremio-setup/dbt-Samples.png differ diff --git a/website/static/img/reference/dremio-setup/dbt-SamplesPath.png b/website/static/img/reference/dremio-setup/dbt-SamplesPath.png new file mode 100644 index 00000000000..8b887ab2e84 Binary files /dev/null and b/website/static/img/reference/dremio-setup/dbt-SamplesPath.png differ diff --git a/website/static/img/reference/dremio-setup/dbt-Spaces.png b/website/static/img/reference/dremio-setup/dbt-Spaces.png new file mode 100644 index 00000000000..1080654cad9 Binary files /dev/null and b/website/static/img/reference/dremio-setup/dbt-Spaces.png differ diff --git a/website/static/img/reference/dremio-setup/dbt-SpacesPath.png b/website/static/img/reference/dremio-setup/dbt-SpacesPath.png new file mode 100644 index 00000000000..e7405ad127c Binary files /dev/null and b/website/static/img/reference/dremio-setup/dbt-SpacesPath.png differ diff --git a/website/static/img/run-start.jpg b/website/static/img/run-start.jpg new file mode 100644 index 00000000000..d5706ab8140 Binary files /dev/null and b/website/static/img/run-start.jpg differ diff --git a/website/static/img/seed-icon.svg b/website/static/img/seed-icon.svg new file mode 100644 index 00000000000..64c5395b45e --- /dev/null +++ b/website/static/img/seed-icon.svg @@ -0,0 +1,3 @@ + + + diff --git a/website/static/js/onetrust.js b/website/static/js/onetrust.js 
index f7fbcbd51c6..0f987eef36a 100644 --- a/website/static/js/onetrust.js +++ b/website/static/js/onetrust.js @@ -1,3 +1,4 @@ +/* eslint-disable */ function openPreferenceCenter() { if(window?.OneTrust?.ToggleInfoDisplay) { console.log('opening center') diff --git a/website/vercel.json b/website/vercel.json new file mode 100644 index 00000000000..14b3a0a6af0 --- /dev/null +++ b/website/vercel.json @@ -0,0 +1,4026 @@ +{ + "cleanUrls": true, + "trailingSlash": false, + "redirects": [ + { + "source": "/docs/build/metricflow-cli", + "destination": "/docs/build/metricflow-commands", + "permanent": true + }, + { + "source": "/docs/core/about-the-cli", + "destination": "/docs/core/about-dbt-core", + "permanent": true + }, + { + "source": "/docs/cloud/about-cloud/about-cloud-ide", + "destination": "/docs/cloud/about-cloud-develop", + "permanent": true + }, + { + "source": "/faqs/models/reference-models-in-another-project", + "destination": "/docs/collaborate/govern/project-dependencies", + "permanent": true + }, + { + "source": "/faqs/Models/reference-models-in-another-project", + "destination": "/docs/collaborate/govern/project-dependencies", + "permanent": true + }, + { + "source": "/docs/deploy/job-triggers", + "destination": "/docs/deploy/deploy-jobs", + "permanent": true + }, + { + "source": "/docs/deploy/job-settings", + "destination": "/docs/deploy/deploy-jobs", + "permanent": true + }, + { + "source": "/docs/deploy/dbt-cloud-job", + "destination": "/docs/deploy/deploy-jobs", + "permanent": true + }, + { + "source": "/faqs/environments/beta-release", + "destination": "/docs/dbt-versions/product-lifecycles", + "permanent": true + }, + { + "source": "/docs/deploy/slim-ci-jobs", + "destination": "/docs/deploy/ci-jobs", + "permanent": true + }, + { + "source": "/guides/dbt-ecosystem/sl-partner-integration-guide", + "destination": "/docs/use-dbt-semantic-layer/avail-sl-integrations", + "permanent": true + }, + { + "source": 
"/docs/use-dbt-semantic-layer/dbt-semantic-layer", + "destination": "/docs/use-dbt-semantic-layer/dbt-sl", + "permanent": true + }, + { + "source": "/docs/use-dbt-semantic-layer/set-up-semantic-layer", + "destination": "/docs/use-dbt-semantic-layer/setup-sl", + "permanent": true + }, + { + "source": "/docs/use-dbt-semantic-layer/setup-dbt-semantic-layer", + "destination": "/docs/use-dbt-semantic-layer/setup-sl", + "permanent": true + }, + { + "source": "/docs/use-dbt-semantic-layer/quickstart-semantic-layer", + "destination": "/docs/use-dbt-semantic-layer/quickstart-sl", + "permanent": true + }, + { + "source": "/docs/collaborate/environments/environments-in-dbt", + "destination": "/docs/environments-in-dbt", + "permanent": true + }, + { + "source": "/docs/collaborate/environments/dbt-cloud-environments", + "destination": "/docs/deploy/dbt-cloud-environments", + "permanent": true + }, + { + "source": "/docs/collaborate/environments/dbt-core-environments", + "destination": "/docs/core/dbt-core-environments", + "permanent": true + }, + { + "source": "/docs/cloud/manage-access/licenses-and-groups", + "destination": "/docs/cloud/manage-access/about-user-access", + "permanent": true + }, + { + "source": "/docs/deploy/cloud-ci-job", + "destination": "/docs/deploy/continuous-integration", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/bigquery", + "destination": "/quickstarts/bigquery", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/databricks", + "destination": "/quickstarts/databricks", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/redshift", + "destination": "/quickstarts/redshift", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/snowflake", + "destination": "/quickstarts/snowflake", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/starburst-galaxy", + "destination": "/quickstarts/starburst-galaxy", + "permanent": true + }, + { + "source": 
"/docs/quickstarts/dbt-core/codespace", + "destination": "/quickstarts/codespace", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-core/manual-install", + "destination": "/quickstarts/manual-install", + "permanent": true + }, + { + "source": "/docs/deploy/project-state", + "destination": "/reference/node-selection/syntax", + "permanent": true + }, + { + "source": "/reference/global-configs", + "destination": "/reference/global-configs/about-global-configs", + "permanent": true + }, + { + "source": "/docs/quickstarts/overview", + "destination": "/quickstarts", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#verified-adapters", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#community-adapters", + "destination": "/docs/community-adapters", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#adapter-installation", + "destination": "/docs/connect-adapters", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#adapter-taxonomy", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#verified-by-dbt-labs", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#maintainers", + "destination": "/docs/connect-adapters#maintainers", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#contributing-to-dbt-core-adapters", + "destination": "/docs/contribute-core-adapters", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#contributing-to-a-pre-existing-adapter", + "destination": "/docs/contribute-core-adapters#contribute-to-a-pre-existing-adapter", + "permanent": true + }, + { + "source": "/docs/supported-data-platforms#creating-a-new-adapter", + "destination": "/docs/contribute-core-adapters#create-a-new-adapter", + "permanent": true + }, + { + 
"source": "/docs/core/connection-profiles", + "destination": "/docs/core/connect-data-platform/connection-profiles", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/bigquery-setup", + "destination": "/docs/core/connect-data-platform/bigquery-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/postgres-setup", + "destination": "/docs/core/connect-data-platform/postgres-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/redshift-setup", + "destination": "/docs/core/connect-data-platform/redshift-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/snowflake-setup", + "destination": "/docs/core/connect-data-platform/snowflake-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/mssql-setup", + "destination": "/docs/core/connect-data-platform/mssql-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/trino-setup", + "destination": "/docs/core/connect-data-platform/trino-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/singlestore-setup", + "destination": "/docs/core/connect-data-platform/singlestore-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/spark-setup", + "destination": "/docs/core/connect-data-platform/spark-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/databricks-setup", + "destination": "/docs/core/connect-data-platform/databricks-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/hive-setup", + "destination": "/docs/core/connect-data-platform/hive-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/exasol-setup", + "destination": "/docs/core/connect-data-platform/exasol-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/oracle-setup", + "destination": "/docs/core/connect-data-platform/oracle-setup", + "permanent": true + }, + { + "source": 
"/reference/warehouse-setups/azuresynapse-setup", + "destination": "/docs/core/connect-data-platform/azuresynapse-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/dremio-setup", + "destination": "/docs/core/connect-data-platform/dremio-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/clickhouse-setup", + "destination": "/docs/core/connect-data-platform/clickhouse-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/materialize-setup", + "destination": "/docs/core/connect-data-platform/materialize-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/rockset-setup", + "destination": "/docs/core/connect-data-platform/rockset-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/firebolt-setup", + "destination": "/docs/core/connect-data-platform/firebolt-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/teradata-setup", + "destination": "/docs/core/connect-data-platform/teradata-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/athena-setup", + "destination": "/docs/core/connect-data-platform/athena-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/vertica-setup", + "destination": "/docs/core/connect-data-platform/vertica-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/tidb-setup", + "destination": "/docs/core/connect-data-platform/tidb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/glue-setup", + "destination": "/docs/core/connect-data-platform/glue-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/mindsdb-setup", + "destination": "/docs/core/connect-data-platform/mindsdb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/greenplum-setup", + "destination": "/docs/core/connect-data-platform/greenplum-setup", + "permanent": true + }, + { + 
"source": "/reference/warehouse-setups/impala-setup", + "destination": "/docs/core/connect-data-platform/impala-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/layer-setup", + "destination": "/docs/core/connect-data-platform/layer-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/iomete-setup", + "destination": "/docs/core/connect-data-platform/iomete-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/duckdb-setup", + "destination": "/docs/core/connect-data-platform/duckdb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/sqlite-setup", + "destination": "/docs/core/connect-data-platform/sqlite-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/mysql-setup", + "destination": "/docs/core/connect-data-platform/mysql-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/ibmdb2-setup", + "destination": "/docs/core/connect-data-platform/ibmdb2-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/alloydb-setup", + "destination": "/docs/core/connect-data-platform/alloydb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/doris-setup", + "destination": "/docs/core/connect-data-platform/doris-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/infer-setup", + "destination": "/docs/core/connect-data-platform/infer-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/databend-setup", + "destination": "/docs/core/connect-data-platform/databend-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/fal-setup", + "destination": "/docs/core/connect-data-platform/fal-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/decodable-setup", + "destination": "/docs/core/connect-data-platform/decodable-setup", + "permanent": true + }, + { + "source": 
"/docs/dbt-cloud-apis/metadata-schema-source", + "destination": "/docs/dbt-cloud-apis/discovery-schema-source", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-sources", + "destination": "/docs/dbt-cloud-apis/discovery-schema-sources", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-test", + "destination": "/docs/dbt-cloud-apis/discovery-schema-test", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-tests", + "destination": "/docs/dbt-cloud-apis/discovery-schema-tests", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-seed", + "destination": "/docs/dbt-cloud-apis/discovery-schema-seed", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-seeds", + "destination": "/docs/dbt-cloud-apis/discovery-schema-seeds", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-snapshots", + "destination": "/docs/dbt-cloud-apis/discovery-schema-snapshots", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-model", + "destination": "/docs/dbt-cloud-apis/discovery-schema-model", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-models", + "destination": "/docs/dbt-cloud-apis/discovery-schema-models", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-modelByEnv", + "destination": "/docs/dbt-cloud-apis/discovery-schema-modelByEnv", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-metrics", + "destination": "/docs/dbt-cloud-apis/discovery-schema-metrics", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-metric", + "destination": "/docs/dbt-cloud-apis/discovery-schema-metric", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-schema-exposures", + "destination": "/docs/dbt-cloud-apis/discovery-schema-exposures", + "permanent": true + }, + { + "source": 
"/docs/dbt-cloud-apis/metadata-schema-exposure", + "destination": "/docs/dbt-cloud-apis/discovery-schema-exposure", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-use-case-guides", + "destination": "/docs/dbt-cloud-apis/discovery-use-cases-and-examples", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-api", + "destination": "/docs/dbt-cloud-apis/discovery-api", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-apis/metadata-querying", + "destination": "/docs/dbt-cloud-apis/discovery-querying", + "permanent": true + }, + { + "source": "/docs/core/connection-profiles#understanding-threads", + "destination": "/docs/running-a-dbt-project/using-threads", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/about-privatelink", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/snowflake-privatelink", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/redshift-privatelink", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/databricks-privatelink", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/cloud/privatelink/ip-restrictions", + "destination": "/docs/cloud/secure/about-privatelink", + "permanent": true + }, + { + "source": "/docs/deploy/dbt-cloud-job#create-and-schedule-jobs", + "destination": "/docs/deploy/dbt-cloud-job#create-and-schedule-jobs", + "permanent": true + }, + { + "source": "/docs/cloud/dbt-cloud-tips", + "destination": "/docs/cloud/dbt-cloud-ide/dbt-cloud-tips", + "permanent": true + }, + { + "source": "/docs/cloud/develop-in-the-cloud", + "destination": "/docs/cloud/dbt-cloud-ide/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-model-timing-tab", + "destination": 
"/docs/deploy/dbt-cloud-job#model-timing", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-core/quickstart", + "destination": "/quickstarts/manual-install", + "permanent": true + }, + { + "source": "/docs/dbt-versions/release-notes/January-2022/model-timing-more", + "destination": "/docs/deploy/dbt-cloud-job#model-timing", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#dbt-cloud", + "destination": "/docs/deploy/dbt-cloud-job", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#airflow", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#prefect", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#run-dbt-in-production", + "destination": "/docs/deploy/deployments", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#on-prefect-2", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#on-prefect-1", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#dagster", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#automation-servers", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/deploy/deployments#cron", + "destination": "/docs/deploy/deployment-tools", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/enterprise-permissions#permission-sets", + "destination": "/docs/cloud/manage-access/enterprise-permissions#permission-sets", + "permanent": true + }, + { + "source": "/docs/get-started/privatelink/about-privatelink", + "destination": "/docs/cloud/privatelink/about-privatelink", + "permanent": true + }, + { + "source": "/docs/get-started/privatelink/snowflake-privatelink", + "destination": 
"/docs/cloud/privatelink/snowflake-privatelink", + "permanent": true + }, + { + "source": "/docs/get-started/privatelink/redshift-privatelink", + "destination": "/docs/cloud/privatelink/redshift-privatelink", + "permanent": true + }, + { + "source": "/docs/get-started/privatelink/databricks-privatelink", + "destination": "/docs/cloud/privatelink/databricks-privatelink", + "permanent": true + }, + { + "source": "/docs/get-started/dbt-cloud-features", + "destination": "/docs/cloud/about-cloud/dbt-cloud-features", + "permanent": true + }, + { + "source": "/docs/deploy/regions-ip-addresses", + "destination": "/docs/cloud/about-cloud/regions-ip-addresses", + "permanent": true + }, + { + "source": "/docs/deploy/architecture", + "destination": "/docs/cloud/about-cloud/architecture", + "permanent": true + }, + { + "source": "/docs/deploy/single-tenant", + "destination": "/docs/cloud/about-cloud/tenancy", + "permanent": true + }, + { + "source": "/docs/deploy/multi-tenant", + "destination": "/docs/cloud/about-cloud/tenancy", + "permanent": true + }, + { + "source": "/docs/cloud/manage-access/about-access", + "destination": "/docs/cloud/manage-access/about-user-access", + "permanent": true + }, + { + "source": "/docs/collaborate/git/connect-github", + "destination": "/docs/cloud/git/connect-github", + "permanent": true + }, + { + "source": "/docs/collaborate/git/connect-gitlab", + "destination": "/docs/cloud/git/connect-gitlab", + "permanent": true + }, + { + "source": "/docs/collaborate/git/connect-azure-devops", + "destination": "/docs/cloud/git/connect-azure-devops", + "permanent": true + }, + { + "source": "/docs/collaborate/git/setup-azure", + "destination": "/docs/cloud/git/setup-azure", + "permanent": true + }, + { + "source": "/docs/collaborate/git/authenticate-azure", + "destination": "/docs/cloud/git/authenticate-azure", + "permanent": true + }, + { + "source": "/docs/collaborate/git/import-a-project-by-git-url", + "destination": 
"/docs/cloud/git/import-a-project-by-git-url", + "permanent": true + }, + { + "source": "/docs/collaborate/publish/about-publishing-models", + "destination": "/docs/collaborate/govern/about-model-governance", + "permanent": true + }, + { + "source": "/docs/collaborate/publish/model-contracts", + "destination": "/docs/collaborate/govern/model-contracts", + "permanent": true + }, + { + "source": "/docs/collaborate/publish/model-access", + "destination": "/docs/collaborate/govern/model-access", + "permanent": true + }, + { + "source": "/docs/collaborate/publish/model-versions", + "destination": "/docs/collaborate/govern/model-versions", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/about-access", + "destination": "/docs/cloud/manage-access/about-user-access", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/seats-and-users", + "destination": "/docs/cloud/manage-access/seats-and-users", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/self-service-permissions", + "destination": "/docs/cloud/manage-access/self-service-permissions", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/enterprise-permissions", + "destination": "/docs/cloud/manage-access/enterprise-permissions", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/sso-overview", + "destination": "/docs/cloud/manage-access/sso-overview", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-sso-saml-2.0", + "destination": "/docs/cloud/manage-access/set-up-sso-saml-2.0", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-sso-okta", + "destination": "/docs/cloud/manage-access/set-up-sso-okta", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-sso-google-workspace", + "destination": "/docs/cloud/manage-access/set-up-sso-google-workspace", + "permanent": true + }, + { + "source": 
"/docs/collaborate/manage-access/set-up-sso-azure-active-directory", + "destination": "/docs/cloud/manage-access/set-up-sso-azure-active-directory", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-snowflake-oauth", + "destination": "/docs/cloud/manage-access/set-up-snowflake-oauth", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/set-up-bigquery-oauth", + "destination": "/docs/cloud/manage-access/set-up-bigquery-oauth", + "permanent": true + }, + { + "source": "/docs/collaborate/manage-access/audit-log", + "destination": "/docs/cloud/manage-access/audit-log", + "permanent": true + }, + { + "source": "/docs/get-started/develop-in-the-cloud", + "destination": "/docs/cloud/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/get-started/dbt-cloud-tips", + "destination": "/docs/cloud/dbt-cloud-tips", + "permanent": true + }, + { + "source": "/docs/get-started/installation", + "destination": "/docs/core/installation", + "permanent": true + }, + { + "source": "/docs/get-started/about-the-cli", + "destination": "/docs/core/about-the-cli", + "permanent": true + }, + { + "source": "/docs/get-started/homebrew-install", + "destination": "/docs/core/homebrew-install", + "permanent": true + }, + { + "source": "/docs/get-started/pip-install", + "destination": "/docs/core/pip-install", + "permanent": true + }, + { + "source": "/docs/get-started/docker-install", + "destination": "/docs/core/docker-install", + "permanent": true + }, + { + "source": "/docs/get-started/source-install", + "destination": "/docs/core/source-install", + "permanent": true + }, + { + "source": "/docs/get-started/connection-profiles", + "destination": "/docs/core/connection-profiles", + "permanent": true + }, + { + "source": "/docs/get-started/run-your-dbt-projects", + "destination": "/docs/running-a-dbt-project/run-your-dbt-projects", + "permanent": true + }, + { + "source": 
"/docs/get-started/learning-more/refactoring-legacy-sql", + "destination": "/guides/migration/tools/refactoring-legacy-sql", + "permanent": true + }, + { + "source": "/docs/get-started/learning-more/using-jinja", + "destination": "/guides/advanced/using-jinja", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/set-up-dbt-cloud", + "destination": "/quickstarts", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-snowflake", + "destination": "/docs/quickstarts/dbt-cloud/snowflake", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-redshift", + "destination": "/docs/quickstarts/dbt-cloud/redshift", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-databricks", + "destination": "/quickstarts/databricks", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-bigquery", + "destination": "/docs/quickstarts/dbt-cloud/bigquery", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-databricks", + "destination": "/quickstarts/databricks", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-redshift", + "destination": "/docs/quickstarts/dbt-cloud/redshift", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/getting-set-up/setting-up-snowflake", + "destination": "/docs/quickstarts/dbt-cloud/snowflake", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/building-your-first-project/schedule-a-job", + "destination": "/quickstarts/bigquery", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project", + "destination": "/docs/quickstarts/dbt-cloud/bigquery#add-tests-to-your-models", + "permanent": true + }, + { + "source": 
"/docs/get-started/getting-started/building-your-first-project/build-your-first-models", + "destination": "/quickstarts/bigquery?step=8", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/overview", + "destination": "/quickstarts", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started-dbt-core", + "destination": "/docs/quickstarts/dbt-core/quickstart", + "permanent": true + }, + { + "source": "/docs/get-started/develop-in-the-cloud#set-up-environments", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/get-started/develop-in-the-cloud#developer-credentials", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/getting-started/develop-in-the-cloud#setting-up-developer-credentials", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database#connecting-to-redshift-and-postgres", + "destination": "/docs/get-started/connect-your-database#connecting-to-postgres-redshift-and-alloydb", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database#connecting-to-snowflake", + "destination": "/docs/get-started/connect-your-database#connecting-to-snowflake", + "permanent": true + }, + { + "source": "/docs/get-started/connect-your-database#connecting-to-snowflake", + "destination": "/docs/cloud/connect-data-platform/connect-snowflake", + "permanent": true + }, + { + "source": "/docs/get-started/connect-your-database#connecting-to-postgres-redshift-and-alloydb", + "destination": "/cloud/connect-data-platform/connect-redshift-postgresql-alloydb", + "permanent": true + }, + { + "source": "/docs/cloud/connect-data-platform/connect-your-database", + "destination": "/docs/cloud/connect-data-platform/about-connections", + "permanent": true + }, + { + "source": 
"/faqs/connecting-to-two-dbs-not-allowed", + "destination": "/faqs/warehouse/connecting-to-two-dbs-not-allowed", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/ide-beta", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/bigquery", + "destination": "/quickstarts/bigquery", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/redshift", + "destination": "/quickstarts/redshift", + "permanent": true + }, + { + "source": "/docs/quickstarts/dbt-cloud/snowflake", + "destination": "/quickstarts/snowflake", + "permanent": true + }, + { + "source": "/quickstarts/starburst-galaxy", + "destination": "/quickstarts/starburst-galaxy", + "permanent": true + }, + { + "source": "/quickstarts/codespace", + "destination": "/quickstarts/codespace", + "permanent": true + }, + { + "source": "/quickstarts/manual-install", + "destination": "/quickstarts/manual-install", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-model-timing-tab", + "destination": "/docs/get-started/dbt-cloud-features#model-timing-dashboard", + "permanent": true + }, + { + "source": "/docs/dbt-cloud", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version", + "destination": "/docs/dbt-versions/upgrade-core-in-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/viewing-docs-in-the-ide", + "destination": "/docs/get-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-overview", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/artifacts", + "destination": "/docs/deploy/artifacts", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/python-models", + 
"destination": "/docs/build/python-models", + "permanent": true + }, + { + "source": "/docs/deploy/regions", + "destination": "/docs/deploy/regions-ip-addresses", + "permanent": true + }, + { + "source": "/advanced/adapter-development/1-what-are-adapters", + "destination": "/guides/dbt-ecosystem/adapter-development/1-what-are-adapters", + "permanent": true + }, + { + "source": "/advanced/adapter-development/2-prerequisites-for-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/3-building-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/4-testing-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/5-documenting-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/6-promoting-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter", + "permanent": true + }, + { + "source": "/advanced/adapter-development/7-verifying-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/1-what-are-adapters", + "destination": "/guides/dbt-ecosystem/adapter-development/1-what-are-adapters", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/2-prerequisites-for-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/3-building-a-new-adapter", + "destination": 
"/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/4-testing-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/4-testing-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/5-documenting-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/6-promoting-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/6-promoting-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/advanced/adapter-development/7-verifying-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/7-verifying-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/legacy/debugging-errors", + "destination": "/guides/best-practices/debugging-errors", + "permanent": true + }, + { + "source": "/guides/legacy/writing-custom-generic-tests", + "destination": "/guides/best-practices/writing-custom-generic-tests", + "permanent": true + }, + { + "source": "/guides/legacy/creating-new-materializations", + "destination": "/guides/advanced/creating-new-materializations", + "permanent": true + }, + { + "source": "/guides/getting-started", + "destination": "/docs/get-started/getting-started/overview", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/building-your-first-project", + "destination": "/docs/get-started/getting-started/building-your-first-project/build-your-first-models", + "permanent": true + }, + { + "source": "/docs/get-started/getting-started/create-a-project", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/guides/getting-started/building-your-first-project", + "destination": 
"/docs/get-started/getting-started/building-your-first-project/build-your-first-models", + "permanent": true + }, + { + "source": "/guides/getting-started/building-your-first-project/build-your-first-models", + "destination": "/docs/get-started/getting-started/building-your-first-project/build-your-first-models", + "permanent": true + }, + { + "source": "/guides/getting-started/building-your-first-project/schedule-a-job", + "destination": "/docs/get-started/getting-started/building-your-first-project/schedule-a-job", + "permanent": true + }, + { + "source": "/guides/getting-started/building-your-first-project/test-and-document-your-project", + "destination": "/docs/get-started/getting-started/building-your-first-project/test-and-document-your-project", + "permanent": true + }, + { + "source": "/guides/getting-started/create-a-project", + "destination": "/docs/get-started/getting-started/building-your-first-project/build-your-first-models301", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up/setting-up-bigquery", + "destination": "/docs/get-started/getting-started/getting-set-up/setting-up-bigquery", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up/setting-up-databricks", + "destination": "/docs/get-started/getting-started/getting-set-up/setting-up-databricks", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up/setting-up-redshift", + "destination": "/docs/get-started/getting-started/getting-set-up/setting-up-redshift", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-set-up/setting-up-snowflake", + "destination": "/docs/get-started/getting-started/getting-set-up/setting-up-snowflake", + "permanent": true + }, + { + "source": "/guides/getting-started/getting-started", + "destination": 
"/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/guides/getting-started/learning-more", + "destination": "/docs/get-started/getting-started-dbt-core", + "permanent": true + }, + { + "source": "/guides/getting-started/learning-more/getting-started-dbt-core", + "destination": "/docs/get-started/getting-started-dbt-core", + "permanent": true + }, + { + "source": "/guides/getting-started/learning-more/refactoring-legacy-sql", + "destination": "/docs/get-started/learning-more/refactoring-legacy-sql", + "permanent": true + }, + { + "source": "/guides/getting-started/learning-more/using-jinja", + "destination": "/docs/get-started/learning-more/using-jinja", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-quickstart", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/cloud-quickstart", + "destination": "/docs/dbt-cloud/cloud-quickstart", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud", + "destination": "/docs/get-started/getting-started/set-up-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database", + "destination": "/docs/cloud/connect-data-platform/about-connections", + "permanent": true + }, + { + "source": "/docs/get-started/connect-your-database", + "destination": "/docs/cloud/connect-data-platform/about-connections", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/profile", + "destination": "/docs/get-started/connection-profiles", + "permanent": true + }, + { + "source": "/guides/best-practices/materializations/guides/best-practices/materializations/1-overview", + "destination": "/guides/best-practices/materializations/1-guide-overview", + "permanent": true + }, + { + "source": "/docs/deploy/understanding-state", + "destination": "/docs/deploy/about-state", + "permanent": true + }, + { + "source": 
"/guides/legacy/understanding-state", + "destination": "/docs/deploy/about-state", + "permanent": true + }, + { + "source": "/guides/migration/versions/Older%20versions/understanding-state", + "destination": "/docs/deploy/about-state", + "permanent": true + }, + { + "source": "/docs/collaborate/git/resolve-merge-conflicts", + "destination": "/docs/collaborate/git/merge-conflicts", + "permanent": true + }, + { + "source": "/docs/collaborate/environments", + "destination": "/docs/collaborate/environments/environments-in-dbt", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/running-dbt-in-production", + "destination": "/docs/deploy/deployments", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-slack-notifications", + "destination": "/docs/deploy/job-notifications", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud", + "destination": "/docs/develop/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/january-2020-pricing-updates", + "destination": "https://www.getdbt.com/pricing/", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise", + "destination": "https://www.getdbt.com/pricing/", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/archival", + "destination": "/docs/build/snapshots", + "permanent": true + }, + { + "source": "/docs/about/license", + "destination": "/community/resources/contributor-license-agreements", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-using-a-managed-repository", + "destination": "/docs/collaborate/git/managed-repository", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/release-notes", + "destination": "/docs/dbt-versions/dbt-cloud-release-notes", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/audit-log", + "destination": "/docs/collaborate/manage-access/audit-log", + "permanent": true + }, + { + 
"source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-setting-up-bigquery-oauth", + "destination": "/docs/collaborate/manage-access/set-up-bigquery-oauth", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-snowflake-oauth", + "destination": "/docs/collaborate/manage-access/set-up-snowflake-oauth", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-okta", + "destination": "/docs/collaborate/manage-access/set-up-sso-okta", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-sso-with-azure-active-directory", + "destination": "/docs/collaborate/manage-access/set-up-sso-azure-active-directory", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-google-gsuite", + "destination": "/docs/collaborate/manage-access/set-up-sso-google-workspace", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-saml-2.0", + "destination": "/docs/collaborate/manage-access/set-up-sso-saml-2.0", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/sso-overview", + "destination": "/docs/collaborate/manage-access/sso-overview", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/access-control/enterprise-permissions", + "destination": "/docs/collaborate/manage-access/enterprise-permissions", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/access-control/self-service-permissions", + "destination": "/docs/collaborate/manage-access/self-service-permissions", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/access-control/cloud-seats-and-users", + "destination": "/docs/collaborate/manage-access/seats-and-users", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/access-control/access-control-overview", + "destination": "/docs/collaborate/manage-access/about-access", + "permanent": true + }, + { + 
"source": "/docs/dbt-cloud/using-dbt-cloud/cloud-generating-documentation", + "destination": "/docs/collaborate/build-and-view-your-docs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/documentation", + "destination": "/docs/collaborate/documentation", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/managing-environments", + "destination": "/docs/collaborate/environments/environments-in-dbt", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-import-a-project-by-git-url", + "destination": "/docs/collaborate/git/import-a-project-by-git-url", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/authenticate-azure", + "destination": "/docs/collaborate/git/authenticate-azure", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/setup-azure", + "destination": "/docs/collaborate/git/setup-azure", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-azure-devops", + "destination": "/docs/collaborate/git/connect-azure-devops", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-gitlab", + "destination": "/docs/collaborate/git/connect-gitlab", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-installing-the-github-application", + "destination": "/docs/collaborate/git/connect-github", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/setting-up", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/handling-merge-conflicts", + "destination": "/docs/collaborate/git/resolve-merge-conflicts", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/viewing-docs-in-the-ide", + "destination": "/docs/collaborate/cloud-build-and-view-your-docs", + "permanent": true + }, + { + "source": 
"/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-configuring-repositories", + "destination": "/docs/collaborate/git/pr-template", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration", + "destination": "/docs/deploy/cloud-ci-job", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-dashboard-status-tiles", + "destination": "/docs/deploy/dashboard-status-tiles", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-snapshotting-source-freshness", + "destination": "/docs/deploy/source-freshness", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-notifications", + "destination": "/docs/deploy/job-notifications", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-using-a-custom-cron-schedule", + "destination": "/docs/deploy/job-triggers", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/airgapped-deployment", + "destination": "/docs/deploy/airgapped-deployment", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/single-tenant-deployment", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/multi-tenant-deployment", + "destination": "/docs/deploy/multi-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/deployment-architecture", + "destination": "/docs/deploy/architecture", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/deployments/deployment-overview", + "destination": "/docs/deploy/deployments", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-setting-a-custom-target-name", + "destination": "/docs/build/custom-target-names", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-custom-aliases", + "destination": "/docs/build/custom-aliases", + "permanent": true + }, + { + "source": 
"/docs/building-a-dbt-project/building-models/using-custom-databases", + "destination": "/docs/build/custom-databases", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-custom-schemas", + "destination": "/docs/build/custom-schemas", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-exposures", + "destination": "/docs/dbt-cloud-apis/metadata-schema-exposures", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-exposure", + "destination": "/docs/dbt-cloud-apis/metadata-schema-exposure", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-tests", + "destination": "/docs/dbt-cloud-apis/metadata-schema-tests", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-test", + "destination": "/docs/dbt-cloud-apis/metadata-schema-test", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-snapshots", + "destination": "/docs/dbt-cloud-apis/metadata-schema-snapshots", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-seeds", + "destination": "/docs/dbt-cloud-apis/metadata-schema-seeds", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-seed", + "destination": "/docs/dbt-cloud-apis/metadata-schema-seed", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-sources", + "destination": "/docs/dbt-cloud-apis/metadata-schema-sources", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-source", + "destination": "/docs/dbt-cloud-apis/metadata-schema-source", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-metrics", + 
"destination": "/docs/dbt-cloud-apis/metadata-schema-metrics", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-metric", + "destination": "/docs/dbt-cloud-apis/metadata-schema-metric", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-modelByEnv", + "destination": "/docs/dbt-cloud-apis/metadata-schema-modelByEnv", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-models", + "destination": "/docs/dbt-cloud-apis/metadata-schema-models", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/schema/metadata-schema-model", + "destination": "/docs/dbt-cloud-apis/metadata-schema-model", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/metadata-querying", + "destination": "/docs/dbt-cloud-apis/metadata-querying", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/metadata/metadata-overview", + "destination": "/docs/dbt-cloud-apis/metadata-api", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/admin-cloud-api", + "destination": "/docs/dbt-cloud-apis/admin-cloud-api", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/service-tokens", + "destination": "/docs/dbt-cloud-apis/service-tokens", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/user-tokens", + "destination": "/docs/dbt-cloud-apis/user-tokens", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-api/cloud-apis", + "destination": "/docs/dbt-cloud-apis/overview", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/hooks-operations", + "destination": "/docs/build/hooks-operations", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/analyses", + "destination": "/docs/build/analyses", + "permanent": true + }, + { + "source": 
"/docs/building-a-dbt-project/package-management", + "destination": "/docs/build/packages", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-environment-variables", + "destination": "/docs/build/environment-variables", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-variables", + "destination": "/docs/build/project-variables", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/jinja-macros", + "destination": "/docs/build/jinja-macros", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/configuring-incremental-models", + "destination": "/docs/build/incremental-models", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/materializations", + "destination": "/docs/build/materializations", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/tests", + "destination": "/docs/build/tests", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/metrics", + "destination": "/docs/build/metrics", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/exposures", + "destination": "/docs/build/exposures", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/snapshots", + "destination": "/docs/build/snapshots", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/seeds", + "destination": "/docs/build/seeds", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models", + "destination": "/docs/build/sql-models", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/using-sources", + "destination": "/docs/build/sources", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/projects", + "destination": "/docs/build/projects", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/python-models", + "destination": "/docs/build/python-models", + 
"permanent": true + }, + { + "source": "/docs/building-a-dbt-project/macros", + "destination": "/docs/guides/building-packages", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/setting-up", + "destination": "/docs/guides/building-packages", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-jinja-functions", + "destination": "/docs/guides/building-packages", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-upgrading-dbt-versions", + "destination": "/docs/dbt-versions/upgrade-core-in-cloud", + "permanent": true + }, + { + "source": "/docs/core-versions", + "destination": "/docs/dbt-versions/core", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-dbt-cloud-support", + "destination": "/docs/dbt-support", + "permanent": true + }, + { + "source": "/docs/about/viewpoint", + "destination": "/community/resources/viewpoint", + "permanent": true + }, + { + "source": "/docs/viewpoint", + "destination": "/community/resources/viewpoint", + "permanent": true + }, + { + "source": "/dbt-cli/configure-your-profile", + "destination": "/docs/get-started/connection-profiles", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-cli", + "destination": "/docs/get-started/about-the-cli", + "permanent": true + }, + { + "source": "/dbt-cli/install/from-source", + "destination": "/docs/get-started/source-install", + "permanent": true + }, + { + "source": "/dbt-cli/install/docker", + "destination": "/docs/get-started/docker-install", + "permanent": true + }, + { + "source": "/dbt-cli/install/pip", + "destination": "/docs/get-started/pip-install", + "permanent": true + }, + { + "source": "/dbt-cli/install/homebrew", + "destination": "/docs/get-started/homebrew-install", + "permanent": true + }, + { + "source": "/dbt-cli/install/overview", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": 
"/docs/dbt-cloud/cloud-ide/the-dbt-ide", + "destination": "/docs/get-started/dbt-cloud-features", + "permanent": true + }, + { + "source": "/((?!useful).*components)", + "destination": "https://github.com/dbt-labs/docs.getdbt.com/blob/current/contributing/adding-page-components.md", + "permanent": true + }, + { + "source": "/guides/legacy/managing-environments", + "destination": "/docs/building-a-dbt-project/managing-environments", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/dbt-api", + "destination": "/docs/introduction", + "permanent": true + }, + { + "source": "/img/docs/dbt-cloud/dbt-cloud-enterprise/icon.png", + "destination": "https://www.getdbt.com/ui/img/dbt-icon.png", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/centos", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/centos", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/install-from-source", + "destination": "/dbt-cli/install/from-source", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/macos", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/ubuntu-debian", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation-guides/windows", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-cli/installation", + "destination": "/docs/get-started/installation", + "permanent": true + }, + { + "source": "/dbt-jinja-functions", + "destination": "/reference/dbt-jinja-functions", + "permanent": true + }, + { + "source": "/docs", + "destination": "/docs/introduction", + "permanent": true + }, + { + "source": "/docs/adapter", + "destination": "/docs/writing-code-in-dbt/jinja-context/adapter", + "permanent": true + }, + { + 
"source": "/docs/analyses", + "destination": "/docs/building-a-dbt-project/analyses", + "permanent": true + }, + { + "source": "/docs/api-variable", + "destination": "/docs/writing-code-in-dbt/api-variable", + "permanent": true + }, + { + "source": "/docs/archival", + "destination": "/docs/building-a-dbt-project/archival", + "permanent": true + }, + { + "source": "/docs/artifacts", + "destination": "/docs/dbt-cloud/using-dbt-cloud/artifacts", + "permanent": true + }, + { + "source": "/docs/bigquery-configs", + "destination": "/reference/resource-configs/bigquery-configs", + "permanent": true + }, + { + "source": "/reference/resource-properties/docs", + "destination": "/reference/resource-configs/docs", + "permanent": true + }, + { + "source": "/reference/resource-properties/latest-version", + "destination": "/reference/resource-properties/latest_version", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/bigquery-configs", + "destination": "/reference/resource-configs/bigquery-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/configuring-models", + "destination": "/reference/model-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/enable-and-disable-models", + "destination": "/reference/resource-configs/enabled", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/redshift-configs", + "destination": "/reference/resource-configs/redshift-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/snowflake-configs", + "destination": "/reference/resource-configs/snowflake-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/spark-configs", + "destination": "/reference/resource-configs/spark-configs", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/tags", + "destination": 
"/reference/resource-configs/tags", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-sql-headers", + "destination": "/reference/resource-configs/sql_header", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects", + "destination": "/docs/building-a-dbt-project/projects", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/configuring-query-comments", + "destination": "/reference/project-configs/query-comment", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/configuring-quoting", + "destination": "/reference/project-configs/quoting", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/creating-a-project", + "destination": "/docs/building-a-dbt-project/projects#creating-a-dbt-project", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/requiring-specific-dbt-versions", + "destination": "/reference/project-configs/require-dbt-version", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-projects/use-an-existing-project", + "destination": "/docs/building-a-dbt-project/projects#using-an-existing-project", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/hooks", + "destination": "/docs/building-a-dbt-project/hooks-operations", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation", + "destination": "/docs/building-a-dbt-project/tests", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation/documentation", + "destination": "/docs/building-a-dbt-project/testing-and-documentation/documentation", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation/documentation-website", + "destination": "/docs/building-a-dbt-project/testing-and-documentation/documentation", + "permanent": true + }, + { + "source": 
"/docs/building-a-dbt-project/testing-and-documentation/schemayml-files", + "destination": "/reference/declaring-properties", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/testing-and-documentation/testing", + "destination": "/docs/building-a-dbt-project/tests", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/using-operations", + "destination": "/docs/building-a-dbt-project/hooks-operations", + "permanent": true + }, + { + "source": "/docs/building-models", + "destination": "/docs/building-a-dbt-project/building-models", + "permanent": true + }, + { + "source": "/docs/building-packages", + "destination": "/guides/legacy/building-packages", + "permanent": true + }, + { + "source": "/docs/centos", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/clean", + "destination": "/reference/commands/clean", + "permanent": true + }, + { + "source": "/docs/cloud-choosing-a-dbt-version", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version", + "permanent": true + }, + { + "source": "/docs/cloud-configuring-dbt-cloud", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/cloud-enabling-continuous-integration-with-github", + "destination": "/docs/deploy/cloud-ci-job", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration-with-github", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-enabling-continuous-integration", + "permanent": true + }, + { + "source": "/docs/cloud-generating-documentation", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-generating-documentation", + "permanent": true + }, + { + "source": 
"/docs/cloud-import-a-project-by-git-url", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-import-a-project-by-git-url", + "permanent": true + }, + { + "source": "/docs/cloud-installing-the-github-application", + "destination": "/docs/cloud/git/connect-github", + "permanent": true + }, + { + "source": "/docs/cloud-managing-permissions", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-managing-permissions", + "permanent": true + }, + { + "source": "/docs/cloud-overview", + "destination": "/docs/dbt-cloud/cloud-overview", + "permanent": true + }, + { + "source": "/docs/cloud-seats-and-users", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-seats-and-users", + "permanent": true + }, + { + "source": "/docs/cloud-setting-a-custom-target-name", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-setting-a-custom-target-name", + "permanent": true + }, + { + "source": "/docs/cloud-snapshotting-source-freshness", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-snapshotting-source-freshness", + "permanent": true + }, + { + "source": "/docs/cloud-supported-dbt-versions", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version", + "permanent": true + }, + { + "source": "/docs/cloud-using-a-custom-cron-schedule", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-using-a-custom-cron-schedule", + "permanent": true + }, + { + "source": "/docs/cloud-using-a-managed-repository", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-using-a-managed-repository", + "permanent": true + }, + { + "source": "/docs/cmd-docs", + "destination": "/reference/commands/cmd-docs", + "permanent": true + }, + { + "source": "/docs/command-line-interface", + "destination": "/reference/dbt-commands", + "permanent": true + }, + { + "source": "/docs/compile", + "destination": "/reference/commands/compile", + "permanent": true + }, + { + "source": "/docs/config", + "destination": 
"/docs/writing-code-in-dbt/jinja-context/config", + "permanent": true + }, + { + "source": "/docs/configure-your-profile", + "destination": "/dbt-cli/configure-your-profile", + "permanent": true + }, + { + "source": "/docs/configuring-incremental-models", + "destination": "/docs/building-a-dbt-project/building-models/configuring-incremental-models", + "permanent": true + }, + { + "source": "/docs/configuring-models", + "destination": "/reference/model-configs", + "permanent": true + }, + { + "source": "/docs/configuring-query-comments", + "destination": "/docs/building-a-dbt-project/dbt-projects/configuring-query-comments", + "permanent": true + }, + { + "source": "/docs/configuring-quoting", + "destination": "/docs/building-a-dbt-project/dbt-projects/configuring-quoting", + "permanent": true + }, + { + "source": "/docs/configuring-resources-from-the-project-file", + "destination": "/docs/building-a-dbt-project/dbt-projects/configuring-resources-from-the-project-file", + "permanent": true + }, + { + "source": "/docs/connecting-your-database", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/connecting-your-database", + "permanent": true + }, + { + "source": "/docs/contributor-license-agreements", + "destination": "/docs/contributing/contributor-license-agreements", + "permanent": true + }, + { + "source": "/docs/creating-a-project", + "destination": "/docs/building-a-dbt-project/dbt-projects/creating-a-project", + "permanent": true + }, + { + "source": "/docs/creating-new-materializations", + "destination": "/guides/legacy/creating-new-materializations", + "permanent": true + }, + { + "source": "/docs/creating-date-partitioned-tables", + "destination": "/docs/guides/database-specific-guides/creating-date-partitioned-tables", + "permanent": true + }, + { + "source": "/docs/custom-schema-tests", + "destination": "/guides/legacy/writing-custom-generic-tests", + "permanent": true + }, + { + "source": "/docs/database-specific-guides", + "destination": "/", 
+ "permanent": true + }, + { + "source": "/docs/dbt-api", + "destination": "/docs/running-a-dbt-project/dbt-api", + "permanent": true + }, + { + "source": "/docs/dbt-cloud-enterprise", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-repositories", + "destination": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-configuring-repositories", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-configuring-dbt-cloud/cloud-choosing-a-dbt-version", + "destination": "/docs/dbt-versions/upgrade-core-in-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permissions", + "destination": "/docs/dbt-cloud/access-control/enterprise-permissions", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/architecture", + "destination": "/dbt-cloud/on-premises/dependencies", + "permanent": true + }, + { + "source": "/docs/dbt-projects", + "destination": "/docs/building-a-dbt-project/dbt-projects", + "permanent": true + }, + { + "source": "/docs/dbt_projectyml-file", + "destination": "/docs/building-a-dbt-project/dbt-projects/dbt_projectyml-file", + "permanent": true + }, + { + "source": "/docs/debug", + "destination": "/reference/commands/debug", + "permanent": true + }, + { + "source": "/docs/debug-method", + "destination": "/docs/writing-code-in-dbt/jinja-context/debug-method", + "permanent": true + }, + { + "source": "/docs/deps", + "destination": "/reference/commands/deps", + "permanent": true + }, + { + "source": "/docs/doc", + "destination": "/docs/writing-code-in-dbt/jinja-context/doc", + "permanent": true + }, + { + "source": "/docs/documentation", + "destination": "/docs/building-a-dbt-project/documentation", + "permanent": true + }, + { + "source": "/docs/documentation-website", + "destination": "/docs/building-a-dbt-project/documentation", + "permanent": true + }, + { + "source": "/docs/dont-nest-your-curlies", + 
"destination": "/docs/building-a-dbt-project/dont-nest-your-curlies", + "permanent": true + }, + { + "source": "/docs/enable-and-disable-models", + "destination": "/reference/resource-configs/enabled", + "permanent": true + }, + { + "source": "/docs/enterprise-permissions", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/enterprise-permissions", + "permanent": true + }, + { + "source": "/docs/env_var", + "destination": "/docs/writing-code-in-dbt/jinja-context/env_var", + "permanent": true + }, + { + "source": "/docs/exceptions", + "destination": "/docs/writing-code-in-dbt/jinja-context/exceptions", + "permanent": true + }, + { + "source": "/docs/execute", + "destination": "/docs/writing-code-in-dbt/jinja-context/execute", + "permanent": true + }, + { + "source": "/docs/exit-codes", + "destination": "/reference/exit-codes", + "permanent": true + }, + { + "source": "/docs/flags", + "destination": "/docs/writing-code-in-dbt/jinja-context/flags", + "permanent": true + }, + { + "source": "/docs/fromjson", + "destination": "/docs/writing-code-in-dbt/jinja-context/fromjson", + "permanent": true + }, + { + "source": "/docs/getting-started-with-jinja", + "destination": "/docs/building-a-dbt-project/jinja-macros", + "permanent": true + }, + { + "source": "/docs/global-cli-flags", + "destination": "/reference/global-cli-flags", + "permanent": true + }, + { + "source": "/docs/graph", + "destination": "/docs/writing-code-in-dbt/jinja-context/graph", + "permanent": true + }, + { + "source": "/docs/guides/building-packages", + "destination": "/guides/legacy/building-packages", + "permanent": true + }, + { + "source": "/docs/guides/creating-new-materializations", + "destination": "/guides/legacy/creating-new-materializations", + "permanent": true + }, + { + "source": "/docs/guides/debugging-errors", + "destination": "/guides/legacy/debugging-errors", + "permanent": true + }, + { + "source": "/docs/guides/debugging-schema-names", + "destination": 
"/guides/legacy/debugging-schema-names", + "permanent": true + }, + { + "source": "/docs/guides/getting-help", + "destination": "/guides/legacy/getting-help", + "permanent": true + }, + { + "source": "/docs/guides/managing-environments", + "destination": "/guides/legacy/managing-environments", + "permanent": true + }, + { + "source": "/docs/guides/navigating-the-docs", + "destination": "/guides/legacy/navigating-the-docs", + "permanent": true + }, + { + "source": "/docs/guides/understanding-state", + "destination": "/guides/legacy/understanding-state", + "permanent": true + }, + { + "source": "/docs/guides/videos", + "destination": "/guides/legacy/videos", + "permanent": true + }, + { + "source": "/docs/guides/writing-custom-generic-tests", + "destination": "/guides/legacy/writing-custom-generic-tests", + "permanent": true + }, + { + "source": "/docs/guides/writing-custom-schema-tests", + "destination": "/guides/legacy/writing-custom-generic-tests", + "permanent": true + }, + { + "source": "/docs/guides/best-practices#choose-your-materializations-wisely", + "destination": "/guides/legacy/best-practices#choose-your-materializations-wisely", + "permanent": true + }, + { + "source": "/docs/guides/best-practices#version-control-your-dbt-project", + "destination": "/guides/legacy/best-practices#version-control-your-dbt-project", + "permanent": true + }, + { + "source": "/docs/best-practices", + "destination": "/guides/legacy/best-practices", + "permanent": true + }, + { + "source": "/docs/guides/best-practices", + "destination": "/guides/best-practices", + "permanent": true + }, + { + "source": "/docs/hooks", + "destination": "/docs/building-a-dbt-project/hooks-operations", + "permanent": true + }, + { + "source": "/docs/init", + "destination": "/reference/commands/init", + "permanent": true + }, + { + "source": "/docs/install-from-source", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/installation", + "destination": 
"/docs/core/installation", + "permanent": true + }, + { + "source": "/docs/invocation_id", + "destination": "/docs/writing-code-in-dbt/jinja-context/invocation_id", + "permanent": true + }, + { + "source": "/docs/jinja-context", + "destination": "/docs/writing-code-in-dbt/jinja-context", + "permanent": true + }, + { + "source": "/docs/license", + "destination": "/docs/about/license", + "permanent": true + }, + { + "source": "/docs/list", + "destination": "/reference/commands/list", + "permanent": true + }, + { + "source": "/docs/log", + "destination": "/docs/writing-code-in-dbt/jinja-context/log", + "permanent": true + }, + { + "source": "/docs/macos", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/macros", + "destination": "/guides/legacy/building-packages", + "permanent": true + }, + { + "source": "/docs/maintaining-multiple-environments-with-dbt", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/managing-environments", + "destination": "/guides/legacy/managing-environments", + "permanent": true + }, + { + "source": "/docs/materializations", + "destination": "/docs/building-a-dbt-project/building-models/materializations", + "permanent": true + }, + { + "source": "/docs/model-selection-syntax", + "destination": "/reference/node-selection/syntax", + "permanent": true + }, + { + "source": "/docs/modules", + "destination": "/docs/writing-code-in-dbt/jinja-context/modules", + "permanent": true + }, + { + "source": "/docs/on-run-end-context", + "destination": "/docs/writing-code-in-dbt/jinja-context/on-run-end-context", + "permanent": true + }, + { + "source": "/docs/overview", + "destination": "/docs/introduction", + "permanent": true + }, + { + "source": "/docs/performance-optimization", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/package-management", + "destination": "/docs/building-a-dbt-project/package-management", + "permanent": true + }, + { + "source": 
"/docs/profile-bigquery", + "destination": "/reference/warehouse-profiles/bigquery-profile", + "permanent": true + }, + { + "source": "/docs/profile-mssql", + "destination": "/reference/warehouse-profiles/mssql-profile", + "permanent": true + }, + { + "source": "/docs/profile-postgres", + "destination": "/reference/warehouse-profiles/postgres-profile", + "permanent": true + }, + { + "source": "/docs/profile-presto", + "destination": "/reference/warehouse-profiles/presto-profile", + "permanent": true + }, + { + "source": "/docs/profile-redshift", + "destination": "/reference/warehouse-profiles/redshift-profile", + "permanent": true + }, + { + "source": "/docs/profile-snowflake", + "destination": "/reference/warehouse-profiles/snowflake-profile", + "permanent": true + }, + { + "source": "/docs/profile-spark", + "destination": "/reference/warehouse-profiles/spark-profile", + "permanent": true + }, + { + "source": "/docs/redshift-configs", + "destination": "/reference/resource-configs/redshift-configs", + "permanent": true + }, + { + "source": "/docs/spark-configs", + "destination": "/reference/resource-configs/spark-configs", + "permanent": true + }, + { + "source": "/docs/redshift-v2", + "destination": "/reference/warehouse-profiles/redshift-profile", + "permanent": true + }, + { + "source": "/docs/ref", + "destination": "/docs/writing-code-in-dbt/jinja-context/ref", + "permanent": true + }, + { + "source": "/docs/requiring-specific-dbt-versions", + "destination": "/docs/building-a-dbt-project/dbt-projects/requiring-specific-dbt-versions", + "permanent": true + }, + { + "source": "/docs/requiring-dbt-versions", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/return", + "destination": "/docs/writing-code-in-dbt/jinja-context/return", + "permanent": true + }, + { + "source": "/docs/rpc", + "destination": "/reference/commands/rpc", + "permanent": true + }, + { + "source": "/docs/run", + "destination": "/reference/commands/run", + "permanent": true 
+ }, + { + "source": "/docs/run-operation", + "destination": "/reference/commands/run-operation", + "permanent": true + }, + { + "source": "/docs/run_query", + "destination": "/docs/writing-code-in-dbt/jinja-context/run_query", + "permanent": true + }, + { + "source": "/docs/run_started_at", + "destination": "/docs/writing-code-in-dbt/jinja-context/run_started_at", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface", + "destination": "/reference/dbt-commands", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/clean", + "destination": "/reference/commands/clean", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/cmd-docs", + "destination": "/reference/commands/cmd-docs", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/compile", + "destination": "/reference/commands/compile", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/debug", + "destination": "/reference/commands/debug", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/deps", + "destination": "/reference/commands/deps", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/exit-codes", + "destination": "/reference/exit-codes", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/global-cli-flags", + "destination": "/reference/global-cli-flags", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/init", + "destination": "/reference/commands/init", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/list", + "destination": "/reference/commands/list", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/model-selection-syntax", + "destination": 
"/reference/model-selection-syntax", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/rpc", + "destination": "/reference/commands/rpc", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/run", + "destination": "/reference/commands/run", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/run-operation", + "destination": "/reference/commands/run-operation", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/seed", + "destination": "/reference/commands/seed", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/snapshot", + "destination": "/reference/commands/snapshot", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/source", + "destination": "/reference/commands/source", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/test", + "destination": "/reference/commands/test", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/command-line-interface/version", + "destination": "/reference/global-cli-flags#version", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface", + "destination": "/docs/running-a-dbt-project/using-the-cli", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/centos", + "destination": "/dbt-cli/installation-guides/centos", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/configure-your-profile", + "destination": "/dbt-cli/configure-your-profile", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/install-from-source", + "destination": "/dbt-cli/installation-guides/install-from-source", + "permanent": true + }, + { + "source": 
"/docs/running-a-dbt-project/using-the-command-line-interface/installation", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/macos", + "destination": "/dbt-cli/installation-guides/macos", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/ubuntu-debian", + "destination": "/dbt-cli/installation-guides/ubuntu-debian", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-command-line-interface/windows", + "destination": "/dbt-cli/installation-guides/windows", + "permanent": true + }, + { + "source": "/docs/running-dbt-in-production", + "destination": "/docs/running-a-dbt-project/running-dbt-in-production", + "permanent": true + }, + { + "source": "/docs/schema", + "destination": "/docs/writing-code-in-dbt/jinja-context/schema", + "permanent": true + }, + { + "source": "/docs/schemas", + "destination": "/docs/writing-code-in-dbt/jinja-context/schemas", + "permanent": true + }, + { + "source": "/docs/schemayml-files", + "destination": "/reference/declaring-properties", + "permanent": true + }, + { + "source": "/docs/seed", + "destination": "/reference/commands/seed", + "permanent": true + }, + { + "source": "/docs/seeds", + "destination": "/docs/building-a-dbt-project/seeds", + "permanent": true + }, + { + "source": "/docs/setting-up-enterprise-sso-with-azure-active-directory", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-sso-with-azure-active-directory", + "permanent": true + }, + { + "source": "/docs/setting-up-snowflake-sso", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-enterprise-snowflake-oauth", + "permanent": true + }, + { + "source": "/docs/setting-up-sso-with-google-gsuite", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-google-gsuite", + "permanent": true + }, + { + "source": 
"/docs/setting-up-sso-with-okta", + "destination": "/docs/dbt-cloud/dbt-cloud-enterprise/setting-up-sso-with-okta", + "permanent": true + }, + { + "source": "/docs/snapshot", + "destination": "/reference/commands/snapshot", + "permanent": true + }, + { + "source": "/docs/snapshots", + "destination": "/docs/building-a-dbt-project/snapshots", + "permanent": true + }, + { + "source": "/docs/snowflake-configs", + "destination": "/reference/resource-configs/snowflake-configs", + "permanent": true + }, + { + "source": "/docs/source", + "destination": "/reference/commands/source", + "permanent": true + }, + { + "source": "/docs/statement-blocks", + "destination": "/docs/writing-code-in-dbt/jinja-context/statement-blocks", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-bigquery", + "destination": "/reference/bigquery-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-mssql", + "destination": "/reference/mssql-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-postgres", + "destination": "/reference/postgres-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-presto", + "destination": "/reference/presto-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-redshift", + "destination": "/reference/redshift-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-snowflake", + "destination": "/reference/snowflake-profile", + "permanent": true + }, + { + "source": "/docs/supported-databases/profile-spark", + "destination": "/reference/spark-profile", + "permanent": true + }, + { + "source": "/docs/tags", + "destination": "/reference/resource-configs/tags", + "permanent": true + }, + { + "source": "/docs/target", + "destination": "/docs/writing-code-in-dbt/jinja-context/target", + "permanent": true + }, + { + "source": "/docs/test", + "destination": "/reference/commands/test", + 
"permanent": true + }, + { + "source": "/docs/testing", + "destination": "/docs/building-a-dbt-project/tests", + "permanent": true + }, + { + "source": "/docs/testing-and-documentation", + "destination": "/docs/building-a-dbt-project/tests", + "permanent": true + }, + { + "source": "/docs/the-dbt-ide", + "destination": "/docs/cloud/about-cloud/dbt-cloud-features", + "permanent": true + }, + { + "source": "/docs/this", + "destination": "/docs/writing-code-in-dbt/jinja-context/this", + "permanent": true + }, + { + "source": "/docs/tojson", + "destination": "/docs/writing-code-in-dbt/jinja-context/tojson", + "permanent": true + }, + { + "source": "/docs/ubuntu-debian", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/use-an-existing-project", + "destination": "/docs/building-a-dbt-project/dbt-projects/use-an-existing-project", + "permanent": true + }, + { + "source": "/docs/using-custom-aliases", + "destination": "/docs/building-a-dbt-project/building-models/using-custom-aliases", + "permanent": true + }, + { + "source": "/docs/using-custom-database", + "destination": "/docs/building-a-dbt-project/building-models/using-custom-databases", + "permanent": true + }, + { + "source": "/docs/using-custom-schemas", + "destination": "/docs/building-a-dbt-project/building-models/using-custom-schemas", + "permanent": true + }, + { + "source": "/docs/using-dbt-cloud", + "destination": "/docs/dbt-cloud/using-dbt-cloud", + "permanent": true + }, + { + "source": "/docs/using-jinja", + "destination": "/guides/getting-started/learning-more/using-jinja", + "permanent": true + }, + { + "source": "/docs/using-operations", + "destination": "/docs/building-a-dbt-project/hooks-operations", + "permanent": true + }, + { + "source": "/docs/using-sources", + "destination": "/docs/building-a-dbt-project/using-sources", + "permanent": true + }, + { + "source": "/docs/using-sql-headers", + "destination": "/reference/resource-configs/sql_header", + 
"permanent": true + }, + { + "source": "/docs/using-the-command-line-interface", + "destination": "/docs/running-a-dbt-project/using-the-cli", + "permanent": true + }, + { + "source": "/docs/using-the-dbt-ide", + "destination": "/docs/running-a-dbt-project/using-the-dbt-ide", + "permanent": true + }, + { + "source": "/docs/using-variables", + "destination": "/docs/building-a-dbt-project/building-models/using-variables", + "permanent": true + }, + { + "source": "/docs/var", + "destination": "/docs/writing-code-in-dbt/jinja-context/var", + "permanent": true + }, + { + "source": "/docs/version", + "destination": "/reference/global-cli-flags#version", + "permanent": true + }, + { + "source": "/docs/videos", + "destination": "/guides/legacy/videos", + "permanent": true + }, + { + "source": "/docs/warehouse-specific-configurations", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/windows", + "destination": "/dbt-cli/installation", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/api-variable", + "destination": "/", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/class-reference", + "destination": "/reference/dbt-classes", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/extending-dbts-programming-environment/creating-new-materializations", + "destination": "/guides/legacy/creating-new-materializations", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/extending-dbts-programming-environment/custom-schema-tests", + "destination": "/guides/legacy/writing-custom-schema-tests", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/getting-started-with-jinja", + "destination": "/docs/building-a-dbt-project/jinja-macros", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/adapter", + "destination": "/reference/dbt-jinja-functions/adapter", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/as_text", + 
"destination": "/reference/dbt-jinja-functions/as_text", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/builtins", + "destination": "/reference/dbt-jinja-functions/builtins", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/config", + "destination": "/reference/dbt-jinja-functions/config", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/dbt-project-yml-context", + "destination": "/reference/dbt-jinja-functions/dbt-project-yml-context", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/dbt_version", + "destination": "/reference/dbt-jinja-functions/dbt_version", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/debug-method", + "destination": "/reference/dbt-jinja-functions/debug-method", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/doc", + "destination": "/reference/dbt-jinja-functions/doc", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/env_var", + "destination": "/reference/dbt-jinja-functions/env_var", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/exceptions", + "destination": "/reference/dbt-jinja-functions/exceptions", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/execute", + "destination": "/reference/dbt-jinja-functions/execute", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/flags", + "destination": "/reference/dbt-jinja-functions/flags", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/fromjson", + "destination": "/reference/dbt-jinja-functions/fromjson", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/fromyaml", + "destination": "/reference/dbt-jinja-functions/fromyaml", + "permanent": true + }, + { + "source": 
"/docs/writing-code-in-dbt/jinja-context/graph", + "destination": "/reference/dbt-jinja-functions/graph", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/invocation_id", + "destination": "/reference/dbt-jinja-functions/invocation_id", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/log", + "destination": "/reference/dbt-jinja-functions/log", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/modules", + "destination": "/reference/dbt-jinja-functions/modules", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/on-run-end-context", + "destination": "/reference/dbt-jinja-functions/on-run-end-context", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/profiles-yml-context", + "destination": "/reference/dbt-jinja-functions/profiles-yml-context", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/project_name", + "destination": "/reference/dbt-jinja-functions/project_name", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/ref", + "destination": "/reference/dbt-jinja-functions/ref", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/return", + "destination": "/reference/dbt-jinja-functions/return", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/run_query", + "destination": "/reference/dbt-jinja-functions/run_query", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/run_started_at", + "destination": "/reference/dbt-jinja-functions/run_started_at", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/schema", + "destination": "/reference/dbt-jinja-functions/schema", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/schemas", + "destination": "/reference/dbt-jinja-functions/schemas", + 
"permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/source", + "destination": "/reference/dbt-jinja-functions/source", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/statement-blocks", + "destination": "/reference/dbt-jinja-functions/statement-blocks", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/target", + "destination": "/reference/dbt-jinja-functions/target", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/this", + "destination": "/reference/dbt-jinja-functions/this", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/tojson", + "destination": "/reference/dbt-jinja-functions/tojson", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/toyaml", + "destination": "/reference/dbt-jinja-functions/toyaml", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/jinja-context/var", + "destination": "/reference/dbt-jinja-functions/var", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/macros", + "destination": "/docs/building-a-dbt-project/jinja-macros", + "permanent": true + }, + { + "source": "/docs/writing-code-in-dbt/using-jinja", + "destination": "/guides/getting-started/learning-more/using-jinja", + "permanent": true + }, + { + "source": "/faqs/getting-help", + "destination": "/guides/legacy/getting-help", + "permanent": true + }, + { + "source": "/migration-guide/upgrading-to-0-17-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/migration-guide/upgrading-to-0-18-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/reference/accounts", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/api", + "destination": "/dbt-cloud/api", + "permanent": true 
+ }, + { + "source": "/reference/bigquery-profile", + "destination": "/reference/warehouse-profile/bigquery-profile", + "permanent": true + }, + { + "source": "/reference/connections", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/data-test-configs", + "destination": "/reference/test-configs", + "permanent": true + }, + { + "source": "/reference/declaring-properties", + "destination": "/reference/configs-and-properties", + "permanent": true + }, + { + "source": "/reference/dbt-artifacts", + "destination": "/reference/artifacts/dbt-artifacts", + "permanent": true + }, + { + "source": "/reference/environments", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/events", + "destination": "/reference/events-logging", + "permanent": true + }, + { + "source": "/reference/jobs", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/model-selection-syntax", + "destination": "/reference/node-selection/syntax", + "permanent": true + }, + { + "source": "/reference/project-configs/on-run-end", + "destination": "/reference/project-configs/on-run-start-on-run-end", + "permanent": true + }, + { + "source": "/reference/project-configs/on-run-start", + "destination": "/reference/project-configs/on-run-start-on-run-end", + "permanent": true + }, + { + "source": "/reference/repositories", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/resource-configs/post-hook", + "destination": "/reference/resource-configs/pre-hook-post-hook", + "permanent": true + }, + { + "source": "/reference/resource-configs/pre-hook", + "destination": "/reference/resource-configs/pre-hook-post-hook", + "permanent": true + }, + { + "source": "/reference/resource-properties/tags", + "destination": "/reference/resource-configs/tags", + "permanent": true + }, + { + "source": "/reference/resource-properties/meta", + "destination": 
"/reference/resource-configs/meta", + "permanent": true + }, + { + "source": "/reference/runs", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/using-the-dbt-cloud-api", + "destination": "/dbt-cloud/api", + "permanent": true + }, + { + "source": "/reference/model-selection-syntax/#test-selection-examples", + "destination": "/reference/node-selection/test-selection-examples", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/building-models/using-custom-database", + "destination": "/docs/building-a-dbt-project/building-models/using-custom-databases", + "permanent": true + }, + { + "source": "/dbt-cloud/api", + "destination": "/dbt-cloud/api-v2", + "permanent": true + }, + { + "source": "/dbt-cloud/api-v2-old", + "destination": "/dbt-cloud/api-v2-legacy", + "permanent": true + }, + { + "source": "/dbt-cloud/api-v4", + "destination": "/docs/dbt-cloud-apis/admin-cloud-api", + "permanent": true + }, + { + "source": "/reference/project-configs/source-paths", + "destination": "/reference/project-configs/model-paths", + "permanent": true + }, + { + "source": "/reference/project-configs/data-paths", + "destination": "/reference/project-configs/seed-paths", + "permanent": true + }, + { + "source": "/reference/project-configs/modules-paths", + "destination": "/reference/project-configs/packages-install-path", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-slack-notifications", + "destination": "/docs/dbt-cloud/using-dbt-cloud/cloud-notifications", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/presto-profile", + "destination": "/reference/profiles.yml", + "permanent": true + }, + { + "source": "/setting-up", + "destination": "/guides/getting-started/getting-set-up/setting-up-bigquery", + "permanent": true + }, + { + "source": "/tutorial/setting-up", + "destination": "/quickstarts", + "permanent": true + }, + { + "source": 
"/tutorial/test-and-document-your-project", + "destination": "/guides/getting-started/building-your-first-project/test-and-document-your-project", + "permanent": true + }, + { + "source": "/tutorial/build-your-first-models", + "destination": "/guides/getting-started/building-your-first-project/build-your-first-models", + "permanent": true + }, + { + "source": "/tutorial/deploy-your-project", + "destination": "/guides/getting-started/building-your-first-project/schedule-a-job", + "permanent": true + }, + { + "source": "/tutorial/using-jinja", + "destination": "/guides/getting-started/learning-more/using-jinja", + "permanent": true + }, + { + "source": "/tutorial/2b-create-a-project-dbt-cli", + "destination": "/guides/getting-started/learning-more/getting-started-dbt-core", + "permanent": true + }, + { + "source": "/tutorial/create-a-project-dbt-cli", + "destination": "/guides/getting-started/learning-more/getting-started-dbt-core", + "permanent": true + }, + { + "source": "/tutorial/2a-create-a-project-dbt-cloud", + "destination": "/guides/getting-started", + "permanent": true + }, + { + "source": "/tutorial/create-a-project-dbt-cloud", + "destination": "/guides/getting-started", + "permanent": true + }, + { + "source": "/tutorial/getting-started", + "destination": "/guides/getting-started", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-changelog", + "destination": "/docs/dbt-cloud/release-notes", + "permanent": true + }, + { + "source": "/faqs/all", + "destination": "/docs/faqs", + "permanent": true + }, + { + "source": "/faqs/:slug", + "destination": "/docs/faqs/:slug*", + "permanent": true + }, + { + "source": "/faqs/dbt-jinja-functions", + "destination": "/reference/dbt-jinja-functions", + "permanent": true + }, + { + "source": "/tutorial/learning-more/:slug", + "destination": "/guides/getting-started/learning-more/:slug*", + "permanent": true + }, + { + "source": "/tutorial/getting-set-up/:slug", + "destination": 
"/guides/getting-started/getting-set-up/:slug*", + "permanent": true + }, + { + "source": "/tutorial/building-your-first-project/:slug", + "destination": "/guides/getting-started/building-your-first-project/:slug*", + "permanent": true + }, + { + "source": "/tutorial/refactoring-legacy-sql", + "destination": "/guides/migration/tools/refactoring-legacy-sql", + "permanent": true + }, + { + "source": "/blog/change-data-capture-metrics", + "destination": "/blog/change-data-capture", + "permanent": true + }, + { + "source": "/blog/intelligent-slim-ci", + "destination": "/docs/deploy/continuous-integration", + "permanent": true + }, + { + "source": "/blog/model-timing-tab", + "destination": "/blog/how-we-shaved-90-minutes-off-model", + "permanent": true + }, + { + "source": "/reference/warehouse-setups/resource-configs/materialize-configs/indexes", + "destination": "/reference/resource-configs/materialize-configs#indexes", + "permanent": true + }, + { + "source": "/docs/build/building-models", + "destination": "/docs/build/models", + "permanent": true + }, + { + "source": "/docs/build/bigquery-profile", + "destination": "/reference/resource-configs/bigquery-configs", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/bigquery-setup", + "destination": "/reference/warehouse-setups/bigquery-setup", + "permanent": true + }, + { + "source": "/date-trunc-sql", + "destination": "/blog/date-trunc-sql", + "permanent": true + }, + { + "source": "/docs/using-hooks", + "destination": "/", + "permanent": true + }, + { + "source": "/blog/how-we-structure-our-dbt-projects", + "destination": "/guides/best-practices/how-we-structure/1-guide-overview", + "permanent": true + }, + { + "source": "/data-testing-why-you-need-it-and-how-to-get-started", + "destination": "https://www.getdbt.com/blog/data-quality-testing/", + "permanent": true + }, + { + "source": "/docs/profile", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": 
"/docs/available-adapters", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/supported-databases", + "destination": "/docs/supported-data-platforms", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-14-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-15-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-16-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-17-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-18-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-19-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-from-0-10-to-0-11", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-014", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/upgrading-to-014", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/upgrading-to-0-14-1", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/upgrading-to-0-16-0", + "destination": "/guides/migration/versions", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-20-0", + "destination": "/guides/migration/versions/upgrading-to-v0.20", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-0-21-0", + "destination": "/guides/migration/versions/upgrading-to-v0.21", + 
"permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-1-0-0", + "destination": "/guides/migration/versions/upgrading-to-v1.0", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/upgrading-to-v1.0", + "destination": "/guides/migration/versions/upgrading-to-v1.0", + "permanent": true + }, + { + "source": "/docs/guides/getting-help", + "destination": "/guides/legacy/getting-help", + "permanent": true + }, + { + "source": "/docs/guides/migration-guide/:slug", + "destination": "/guides/migration/versions/:slug*", + "permanent": true + }, + { + "source": "/docs/guides/:slug", + "destination": "/guides/legacy/:slug*", + "permanent": true + }, + { + "source": "/guides/best-practices/environment-setup/1-env-guide-overview", + "destination": "/guides/orchestration/set-up-ci/overview", + "permanent": true + }, + { + "source": "/guides/best-practices/environment-setup/2-one-deployment-environment", + "destination": "/guides/orchestration/set-up-ci/in-15-minutes", + "permanent": true + }, + { + "source": "/guides/best-practices/environment-setup/3-many-deployment-environments", + "destination": "/guides/orchestration/set-up-ci/multiple-environments", + "permanent": true + }, + { + "source": "/docs/contributing/what-are-adapters", + "destination": "/guides/advanced/adapter-development/1-what-are-adapters", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/1-what-are-adapters", + "destination": "/guides/advanced/adapter-development/1-what-are-adapters", + "permanent": true + }, + { + "source": "/docs/contributing/prerequisites-for-a-new-adapter", + "destination": "/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/2-prerequisites-for-a-new-adapter", + "destination": "/guides/advanced/adapter-development/2-prerequisites-for-a-new-adapter", + "permanent": true + }, + { + "source": 
"/docs/contributing/building-a-new-adapter", + "destination": "/guides/advanced/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/3-building-a-new-adapter", + "destination": "/guides/advanced/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/v0.13/docs/building-a-new-adapter", + "destination": "/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/building-a-new-adapter", + "destination": "/guides/advanced/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/testing-a-new-adapter", + "destination": "/guides/advanced/adapter-development/4-testing-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/4-testing-a-new-adapter", + "destination": "/guides/advanced/adapter-development/4-testing-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/documenting-a-new-adapter", + "destination": "/guides/advanced/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/5-documenting-a-new-adapter", + "destination": "/guides/advanced/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/promoting-a-new-adapter", + "destination": "/guides/advanced/adapter-development/6-promoting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/adapter-development/6-promoting-a-new-adapter", + "destination": "/guides/advanced/adapter-development/6-promoting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/contributing/verifying-a-new-adapter", + "destination": "/guides/advanced/adapter-development/7-verifying-a-new-adapter", + "permanent": true + }, + { + "source": 
"/docs/contributing/adapter-development/7-verifying-a-new-adapter", + "destination": "/guides/advanced/adapter-development/7-verifying-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/using-dbt-cloud/cloud-metrics-layer", + "destination": "/docs/use-dbt-semantic-layer/dbt-semantic-layer", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/impala-profile", + "destination": "/reference/warehouse-setups/impala-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/exasol-profile", + "destination": "/reference/warehouse-setups/exasol-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/layer-profile", + "destination": "/reference/warehouse-setups/layer-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/postgres-profile", + "destination": "/reference/warehouse-setups/postgres-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/greenplum-profile", + "destination": "/reference/warehouse-setups/greenplum-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/alloydb-profile", + "destination": "/reference/warehouse-setups/alloydb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/azuresynapse-profile", + "destination": "/reference/warehouse-setups/azuresynapse-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/snowflake-profile", + "destination": "/reference/warehouse-setups/snowflake-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/rockset-profile", + "destination": "/reference/warehouse-setups/rockset-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/trino-profile", + "destination": "/reference/warehouse-setups/trino-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/glue-profile", + "destination": "/reference/warehouse-setups/glue-setup", + 
"permanent": true + }, + { + "source": "/reference/warehouse-profiles/duckdb-profile", + "destination": "/reference/warehouse-setups/duckdb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/vertica-profile", + "destination": "/reference/warehouse-setups/vertica-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/clickhouse-profile", + "destination": "/reference/warehouse-setups/clickhouse-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/athena-profile", + "destination": "/reference/warehouse-setups/athena-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/iomete-profile", + "destination": "/reference/warehouse-setups/iomete-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/mssql-profile", + "destination": "/reference/warehouse-setups/mssql-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/tidb-profile", + "destination": "/reference/warehouse-setups/tidb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/materialize-profile", + "destination": "/reference/warehouse-setups/materialize-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/redshift-profile", + "destination": "/reference/warehouse-setups/redshift-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/databricks-profile", + "destination": "/reference/warehouse-setups/databricks-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/bigquery-profile", + "destination": "/reference/warehouse-setups/bigquery-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/dremio-profile", + "destination": "/reference/warehouse-setups/dremio-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/oracle-profile", + "destination": "/reference/warehouse-setups/oracle-setup", + "permanent": true + 
}, + { + "source": "/reference/warehouse-profiles/teradata-profile", + "destination": "/reference/warehouse-setups/teradata-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/singlestore-profile", + "destination": "/reference/warehouse-setups/singlestore-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/sqlite-profile", + "destination": "/reference/warehouse-setups/sqlite-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/spark-profile", + "destination": "/reference/warehouse-setups/spark-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/mindsdb-profile", + "destination": "/reference/warehouse-setups/mindsdb-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/ibmdb2-profile", + "destination": "/reference/warehouse-setups/ibmdb2-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/firebolt-profile", + "destination": "/reference/warehouse-setups/firebolt-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/mysql-profile", + "destination": "/reference/warehouse-setups/mysql-setup", + "permanent": true + }, + { + "source": "/reference/warehouse-profiles/hive-profile", + "destination": "/reference/warehouse-setups/hive-setup", + "permanent": true + }, + { + "source": "/reference/using-sources", + "destination": "/docs/build/sources", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/the-dbt-ide", + "destination": "/docs/getting-started/dbt-cloud-features", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/handling-merge-conflicts", + "destination": "/docs/collaborate/git/resolve-merge-conflicts", + "permanent": true + }, + { + "source": "/dbt-cloud/cloud-ide/viewing-docs-in-the-ide", + "destination": "/docs/getting-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/cloud-ide/ide-beta", + 
"destination": "/docs/getting-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/docs/running-a-dbt-project/using-the-dbt-ide", + "destination": "/docs/getting-started/develop-in-the-cloud", + "permanent": true + }, + { + "source": "/dbt-cloud/cloud-ide/the-ide-git-button", + "destination": "/docs/collaborate/git/version-control-basics", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/setting-up", + "destination": "/guides/legacy/building-packages", + "permanent": true + }, + { + "source": "/docs/building-a-dbt-project/dbt-jinja-functions", + "destination": "/reference/dbt-jinja-functions", + "permanent": true + }, + { + "source": "/docs/contributing/long-lived-discussions-guidelines", + "destination": "/community/resources/forum-guidelines", + "permanent": true + }, + { + "source": "/docs/guides/legacy/navigating-the-docs.md", + "destination": "/community/contribute", + "permanent": true + }, + { + "source": "/community/writing-on-discourse", + "destination": "/community/contributing/contributing-online-community", + "permanent": true + }, + { + "source": "/community/contributing", + "destination": "/community/contribute", + "permanent": true + }, + { + "source": "/docs/contributing/contributor-license-agreements", + "destination": "/community/resources/contributor-license-agreements", + "permanent": true + }, + { + "source": "/community/maintaining-a-channel", + "destination": "/community/resources/maintaining-a-channel", + "permanent": true + }, + { + "source": "/docs/contributing/oss-expectations", + "destination": "/community/resources/oss-expectations", + "permanent": true + }, + { + "source": "/docs/slack-rules-of-the-road", + "destination": "/community/resources/community-rules-of-the-road", + "permanent": true + }, + { + "source": "/docs/contributing/slack-rules-of-the-road", + "destination": "/community/resources/community-rules-of-the-road", + "permanent": true + }, + { + "source": 
"/community/resources/slack-rules-of-the-road", + "destination": "/community/resources/community-rules-of-the-road", + "permanent": true + }, + { + "source": "/blog/getting-started-with-the-dbt-semantic-layer", + "destination": "/blog/understanding-the-components-of-the-dbt-semantic-layer", + "permanent": true + }, + { + "source": "/docs/getting-started/develop-in-the-cloud#creating-a-development-environment", + "destination": "/docs/get-started/develop-in-the-cloud#set-up-and-access-the-cloud-ide", + "permanent": true + }, + { + "source": "/docs/cloud-developer-ide", + "destination": "/docs/build/custom-target-names#dbt-cloud-ide", + "permanent": true + }, + { + "source": "/website/docs/docs/contributing/building-a-new-adapter.md", + "destination": "/guides/dbt-ecosystem/adapter-development/3-building-a-new-adapter", + "permanent": true + }, + { + "source": "/guides/legacy/getting-help", + "destination": "/community/resources/getting-help", + "permanent": true + }, + { + "source": "/blog/tags/release-notes", + "destination": "/docs/dbt-versions/dbt-cloud-release-notes", + "permanent": true + }, + { + "source": "/faqs/dbt-jinja-functions", + "destination": "/reference/dbt-jinja-functions", + "permanent": true + }, + { + "source": "/website/docs/docs/contributing/documenting-a-new-adapter.md", + "destination": "/guides/dbt-ecosystem/adapter-development/5-documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/docs/docs/contributing/documenting-a-new-adapter", + "destination": "/docs/contributing/documenting-a-new-adapter", + "permanent": true + }, + { + "source": "/v0.8/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.10/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.12/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.13/reference", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.13/docs/requiring-dbt-versions", + "destination": "/", + 
"permanent": true + }, + { + "source": "/v0.14/docs/cloud-developer-ide", + "destination": "/", + "permanent": true + }, + { + "source": "/v0.15/docs/cloud-import-a-project-by-git-url", + "destination": "/docs/cloud/git/import-a-project-by-git-url", + "permanent": true + }, + { + "source": "/v0.15/docs/configure-your-profile", + "destination": "/docs/core/connection-profiles", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/dependencies", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/faqs", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/index", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/installation", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/prerequisites", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/setup", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/system-requirements", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/docs/dbt-cloud/on-premises/upgrading-kots", + "destination": "/docs/deploy/single-tenant", + "permanent": true + }, + { + "source": "/reference/resource-properties/access", + "destination": "/reference/resource-configs/access", + "permanent": true + } + ] +} diff --git a/website/webpack.config.js b/website/webpack.config.js new file mode 100644 index 00000000000..b2266e25b2a --- /dev/null +++ b/website/webpack.config.js @@ -0,0 +1,19 @@ +module.exports = { + module: { + rules: [ + { + test: /\.(js|jsx)$/, // .js and .jsx files + exclude: /node_modules/, // excluding the node_modules folder + use: { + loader: "babel-loader", + options: { presets: 
['@babel/env', '@babel/preset-react'] }, + }, + }, + { + test: /\.css$/i, + use: ["style-loader", "css-loader"], + }, + ], + }, +}; +