From 387b7a505958b1c3048bf76e027bda8de64d2df3 Mon Sep 17 00:00:00 2001 From: Pedram Navid <1045990+PedramNavid@users.noreply.github.com> Date: Tue, 13 Aug 2024 15:32:37 -0700 Subject: [PATCH] Add Passing Data Between Assets Guide (#23598) Added an initial Guide for quick feedback. Added a collapsible pre-req block . image Walk through three different ways of passing data between assets. I don't know how I feel about the last one. Also created a new Component called CodeExample which lets you embedand highlight code blocks. ``` ``` I have taken inspiration on how to write How To Guides from here: https://diataxis.fr/how-to-guides/ ## Outstanding Questions - Is the content accurate? - Is it at the right level of detail for a `How To`? - Do we like the collapsible code pre-req code block? - How does the sidebar experience feel? Currently Guides > Data Assets > How to pass data between assets --------- Co-authored-by: Sean Lopp --- .github/workflows/build-docs-revamp.yml | 10 ++ docs/.gitignore | 2 + docs/docs-next/docs/concepts/io-managers.md | 8 +- .../docs/concepts/understanding-assets.md | 9 ++ docs/docs-next/docs/guides/data-assets.md | 4 +- .../data-assets/adding-metadata-to-assets.md | 2 +- .../data-assets/creating-asset-factories.md | 2 +- .../data-assets/creating-data-assets.md | 2 +- .../creating-dependencies-between-assets.md | 2 +- .../passing-data-assets/passing-data-avoid.py | 20 +++ .../passing-data-explicit.py | 30 +++++ .../passing-data-io-manager.py | 33 +++++ .../passing-data-rewrite-assets.py | 25 ++++ .../passing-data-between-assets.md | 123 +++++++++++++++++- .../selecting-subsets-of-assets.md | 2 +- docs/docs-next/src/components/CodeExample.tsx | 44 +++++++ docs/docs-next/src/styles/custom.scss | 2 +- docs/docs-next/src/theme/MDXComponents.tsx | 2 + .../config/vocabularies/Dagster/accept.txt | 3 + pyright/master/exclude.txt | 1 + 20 files changed, 312 insertions(+), 14 deletions(-) create mode 100644 
docs/docs-next/docs/concepts/understanding-assets.md create mode 100644 docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-avoid.py create mode 100644 docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-explicit.py create mode 100644 docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-io-manager.py create mode 100644 docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-rewrite-assets.py create mode 100644 docs/docs-next/src/components/CodeExample.tsx diff --git a/.github/workflows/build-docs-revamp.yml b/.github/workflows/build-docs-revamp.yml index 871ce662ddc07..7d63950f0ae47 100644 --- a/.github/workflows/build-docs-revamp.yml +++ b/.github/workflows/build-docs-revamp.yml @@ -31,6 +31,16 @@ jobs: vercel-token: ${{ secrets.VERCEL_TOKEN }} vercel-org-id: ${{ secrets.VERCEL_ORG_ID }} vercel-project-id: ${{ secrets.VERCEL_DOCS_NEXT_PROJECT_ID }} + github-token: ${{ secrets.GITHUB_TOKEN }} + scope: ${{ secrets.VERCEL_ORG_ID }} + + - name: Publish to Vercel Production + uses: amondnet/vercel-action@v25 + if: github.event_name == 'push' && github.ref == 'refs/heads/docs/revamp' + with: + vercel-token: ${{ secrets.VERCEL_TOKEN }} + vercel-org-id: ${{ secrets.VERCEL_ORG_ID }} + vercel-project-id: ${{ secrets.VERCEL_PROJECT_ID }} vercel-args: "--prod" github-token: ${{ secrets.GITHUB_TOKEN }} scope: ${{ secrets.VERCEL_ORG_ID }} diff --git a/docs/.gitignore b/docs/.gitignore index e7d746ffcb4c2..9888c6a493812 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,3 +1,5 @@ sphinx/_build/ .env next/.env +*.duckdb +*.sqlite \ No newline at end of file diff --git a/docs/docs-next/docs/concepts/io-managers.md b/docs/docs-next/docs/concepts/io-managers.md index 078ce80e62986..3ba9237f7150a 100644 --- a/docs/docs-next/docs/concepts/io-managers.md +++ b/docs/docs-next/docs/concepts/io-managers.md @@ -1,5 +1,5 @@ ---- -title: "I/O managers" ---- +## ======= -# I/O managers \ No newline at end of file 
+## title: "I/O managers" + +# I/O managers diff --git a/docs/docs-next/docs/concepts/understanding-assets.md b/docs/docs-next/docs/concepts/understanding-assets.md new file mode 100644 index 0000000000000..c235cf7d4ad59 --- /dev/null +++ b/docs/docs-next/docs/concepts/understanding-assets.md @@ -0,0 +1,9 @@ +--- +title: Understanding Assets +description: Understanding the concept of assets in Dagster +last_update: + date: 2024-08-11 + author: Pedram Navid +--- + + diff --git a/docs/docs-next/docs/guides/data-assets.md b/docs/docs-next/docs/guides/data-assets.md index 1bae6ba05af26..535587f5198df 100644 --- a/docs/docs-next/docs/guides/data-assets.md +++ b/docs/docs-next/docs/guides/data-assets.md @@ -2,4 +2,6 @@ title: "Data assets" --- -# Data assets \ No newline at end of file +# Data assets + +TODO: fill in this section diff --git a/docs/docs-next/docs/guides/data-assets/adding-metadata-to-assets.md b/docs/docs-next/docs/guides/data-assets/adding-metadata-to-assets.md index 039961567a539..42c5929899538 100644 --- a/docs/docs-next/docs/guides/data-assets/adding-metadata-to-assets.md +++ b/docs/docs-next/docs/guides/data-assets/adding-metadata-to-assets.md @@ -1,6 +1,6 @@ --- title: "Adding metadata to assets" -sidebar_position: 4 +sidebar_position: 40 sidebar_label: "Adding metadata" --- diff --git a/docs/docs-next/docs/guides/data-assets/creating-asset-factories.md b/docs/docs-next/docs/guides/data-assets/creating-asset-factories.md index 05be6b5fe73f4..b0fb742c34927 100644 --- a/docs/docs-next/docs/guides/data-assets/creating-asset-factories.md +++ b/docs/docs-next/docs/guides/data-assets/creating-asset-factories.md @@ -1,6 +1,6 @@ --- title: "Creating asset factories" -sidebar_position: 5 +sidebar_position: 50 sidebar_label: "Creating asset factories" --- diff --git a/docs/docs-next/docs/guides/data-assets/creating-data-assets.md b/docs/docs-next/docs/guides/data-assets/creating-data-assets.md index c1d87d59c9d7f..6c5442dbef60c 100644 --- 
a/docs/docs-next/docs/guides/data-assets/creating-data-assets.md +++ b/docs/docs-next/docs/guides/data-assets/creating-data-assets.md @@ -1,6 +1,6 @@ --- title: "Creating data assets" -sidebar_position: 1 +sidebar_position: 10 sidebar_label: "Creating data assets" --- diff --git a/docs/docs-next/docs/guides/data-assets/creating-dependencies-between-assets.md b/docs/docs-next/docs/guides/data-assets/creating-dependencies-between-assets.md index 5b4f28029ec46..c9ac2f5b55efe 100644 --- a/docs/docs-next/docs/guides/data-assets/creating-dependencies-between-assets.md +++ b/docs/docs-next/docs/guides/data-assets/creating-dependencies-between-assets.md @@ -1,6 +1,6 @@ --- title: "Creating dependencies between assets" -sidebar_position: 2 +sidebar_position: 20 sidebar_label: "Creating asset dependencies" --- diff --git a/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-avoid.py b/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-avoid.py new file mode 100644 index 0000000000000..ab5a24503e309 --- /dev/null +++ b/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-avoid.py @@ -0,0 +1,20 @@ +from dagster import asset + + +# Warning! This is not the right way to create assets +@asset +def download_files(): + # Download files from S3, the web, etc. + ... + + +@asset +def unzip_files(): + # Unzip files to local disk or persistent storage + ... + + +@asset +def load_data(): + # Read data previously written and store in a data warehouse + ... 
diff --git a/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-explicit.py b/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-explicit.py new file mode 100644 index 0000000000000..697ba076212c0 --- /dev/null +++ b/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-explicit.py @@ -0,0 +1,30 @@ +import sqlite3 +import tempfile + +from dagster import AssetExecutionContext, Definitions, asset + +database_file = tempfile.NamedTemporaryFile() + + +# highlight-start +@asset +def asset1(): + with sqlite3.connect("database.sqlite") as conn: + conn.execute("CREATE OR REPLACE TABLE test (i INTEGER)") + conn.execute("INSERT INTO test VALUES (42)") + + +@asset(deps=[asset1]) +def asset2(context: AssetExecutionContext): + with sqlite3.connect("database.sqlite") as conn: + result = conn.execute("SELECT * FROM test").fetchall() + context.log.info(result) + # highlight-end + + +defs = Definitions(assets=[asset1, asset2]) + +if __name__ == "__main__": + from dagster import materialize + + materialize(assets=[asset1, asset2]) diff --git a/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-io-manager.py b/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-io-manager.py new file mode 100644 index 0000000000000..fb754fd136b18 --- /dev/null +++ b/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-io-manager.py @@ -0,0 +1,33 @@ +import pandas as pd +from dagster import Definitions, asset +from dagster_duckdb_pandas import DuckDBPandasIOManager + +# highlight-start +duckdb_io_manager = DuckDBPandasIOManager(database="my_database.duckdb", schema="my_schema") + + +@asset +def people(): + return pd.DataFrame({"id": [1, 2, 3], "name": ["Alice", "Bob", "Charlie"]}) + + +@asset +def birds(): + return pd.DataFrame({"id": [1, 2, 3], "name": ["Bluebird", "Robin", "Eagle"]}) + + +@asset +def combined_data(people, birds): + return pd.concat([people, birds]) + # 
highlight-end + + +defs = Definitions( + assets=[people, birds, combined_data], + resources={"io_manager": duckdb_io_manager}, +) + +if __name__ == "__main__": + from dagster import materialize + + materialize(assets=[people, birds, combined_data]) diff --git a/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-rewrite-assets.py b/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-rewrite-assets.py new file mode 100644 index 0000000000000..753b26a4b4e21 --- /dev/null +++ b/docs/docs-next/docs/guides/data-assets/passing-data-assets/passing-data-rewrite-assets.py @@ -0,0 +1,25 @@ +from typing import List + +from dagster import asset + + +def download_files() -> str: + # Download files from S3, the web, etc. + ... + + +def unzip_files(zipfile: str) -> List[str]: + # Unzip files to local disk or persistent storage + ... + + +def load_data(files: List[str]): + # Read data previously written and store in a data warehouse + ... + + +@asset +def my_dataset(): + zipped_files = download_files() + files = unzip_files(zipped_files) + load_data(files) diff --git a/docs/docs-next/docs/guides/data-assets/passing-data-between-assets.md b/docs/docs-next/docs/guides/data-assets/passing-data-between-assets.md index 4080a6121a253..304dfe720eef0 100644 --- a/docs/docs-next/docs/guides/data-assets/passing-data-between-assets.md +++ b/docs/docs-next/docs/guides/data-assets/passing-data-between-assets.md @@ -1,7 +1,124 @@ --- -title: "Passing data between assets" -sidebar_position: 3 +title: How to Pass Data Between Assets +description: Learn how to pass data between assets in Dagster +sidebar_position: 30 sidebar_label: "Passing data between assets" +last_update: + date: 2024-08-11 + author: Pedram Navid --- -# Passing data between assets \ No newline at end of file +As you develop your data pipeline, you'll likely need to pass data between assets. 
By the end of this guide, you'll have a solid understanding of the different approaches to passing data between assets and when to use each one. + +--- + +
<details> +  <summary>Prerequisites</summary> + +To follow the steps in this guide, you'll need: + +- A basic understanding of Dagster concepts such as assets and resources +- Dagster installed, as well as the `dagster-duckdb-pandas` package +</details>
 + +--- + +## Overview + +In Dagster, assets are the building blocks of your data pipeline and it's common to want to pass data between them. This guide will help you understand how to pass data between assets. + +There are three ways of passing data between assets: + +- Explicitly managing data, by using external storage +- Implicitly managing data, using IO Managers +- Avoiding passing data between assets altogether by combining several tasks into a single asset + +This guide walks through all three methods. + +--- + +## Move Data Between Assets Explicitly Using External Storage + +A common and recommended approach to passing data between assets is explicitly managing data using external storage. This example pipeline uses a SQLite database as external storage: + +<CodeExample filePath="guides/data-assets/passing-data-assets/passing-data-explicit.py" language="python" /> + +In this example, the first asset opens a connection to the SQLite database and writes data to it. The second asset opens a connection to the same database and reads data from it. The dependency between the first asset and the second asset is made explicit through the asset's `deps` argument. + +The benefits of this approach are: +- It's explicit and easy to understand how data is stored and retrieved +- You have maximum flexibility in terms of how and where data is stored, for example, based on environment + +The downsides of this approach are: +- You need to manage connections and transactions manually +- You need to handle errors and edge cases, for example, if the database is down or if a connection is closed + +## Move Data Between Assets Implicitly Using IO Managers + +Dagster's IO Managers are a powerful feature that manages data between assets by defining how data is read from and written to external storage. They help separate business logic from I/O operations, reducing boilerplate code and making it easier to change where data is stored. + +I/O managers handle: +1. **Input**: Reading data from storage and loading it into memory for use by dependent assets. +2. 
**Output**: Writing data to the configured storage location. + +For a deeper understanding of IO Managers, check out the [Understanding IO Managers](/concepts/io-managers) guide. + +<CodeExample filePath="guides/data-assets/passing-data-assets/passing-data-io-manager.py" language="python" /> + +In this example, a `DuckDBPandasIOManager` is instantiated to run using a local file. The IO manager handles both reading and writing to the database. + +:::warning + +This example works for local development, but in a production environment +each step would execute in a separate environment and would not have access to the same file system. Consider a cloud-hosted environment for production purposes. + +::: + +The `people()` and `birds()` assets both write their dataframes to DuckDB +for persistent storage. The `combined_data()` asset requests data from both assets by adding them as parameters to the function, and the IO Manager handles reading them from DuckDB and making them available to the `combined_data` function as dataframes. Note that when you use IO Managers you do not need to manually add the asset's dependencies through the `deps` argument. + +The benefits of this approach are: +- The reading and writing of data is handled by the IO Manager, reducing boilerplate code +- It's easy to swap out different IO Managers based on environments without changing the underlying asset computation + +The downsides of this approach are: +- The IO Manager approach is less flexible should you need to customize how data is read or written to storage +- Some decisions may be made by the IO Manager for you, such as naming conventions that can be hard to override. + +## Avoid Passing Data Between Assets by Combining Assets + +In some cases, you may find that you can avoid passing data between assets by +carefully considering how you have modeled your pipeline. + +Consider this example: + +<CodeExample filePath="guides/data-assets/passing-data-assets/passing-data-avoid.py" language="python" /> + +This example downloads files from an external source (such as S3 or the web), unzips them, and loads the data into a data warehouse. It relies on each asset running on the same file system to perform these operations. 
 + +The assets are modeled as tasks, rather than as data assets. For more information on the difference between tasks and data assets, check out the [Thinking in Assets](/concepts/assets/thinking-in-assets) guide. + +In this refactor, the `download_files`, `unzip_files`, and `load_data` assets are combined into a single asset, `my_dataset`. This asset downloads the files, unzips them, and loads the data into a data warehouse. + +<CodeExample filePath="guides/data-assets/passing-data-assets/passing-data-rewrite-assets.py" language="python" /> + +This approach still passes data explicitly, but it does so within a single asset +rather than across assets. This pipeline still assumes that enough disk and +memory are available to handle the data, but for smaller datasets, it can work well. + +The benefits of this approach are: +- All the computation that defines how an asset is created is contained within a single asset, making it easier to understand and maintain +- It can be faster than relying on external storage, and doesn't require the overhead of setting up additional compute instances. + + +The downsides of this approach are: +- It makes certain assumptions about how much data is being processed +- It can be difficult to reuse functions across assets, since they're tightly coupled to the data they produce +- It may not always be possible to swap functionality based on the environment you are running in. For example, if you are running in a cloud environment, you may not have access to the local file system. + + +--- + +## Related Resources + +TODO: add links to relevant API documentation here. 
\ No newline at end of file diff --git a/docs/docs-next/docs/guides/data-assets/selecting-subsets-of-assets.md b/docs/docs-next/docs/guides/data-assets/selecting-subsets-of-assets.md index 2a0e4c47197ff..0ab522b4a9628 100644 --- a/docs/docs-next/docs/guides/data-assets/selecting-subsets-of-assets.md +++ b/docs/docs-next/docs/guides/data-assets/selecting-subsets-of-assets.md @@ -1,6 +1,6 @@ --- title: "Selecting subsets of assets" -sidebar_position: 6 +sidebar_position: 60 sidebar_label: "Selecting assets" --- diff --git a/docs/docs-next/src/components/CodeExample.tsx b/docs/docs-next/src/components/CodeExample.tsx new file mode 100644 index 0000000000000..e14da414aee17 --- /dev/null +++ b/docs/docs-next/src/components/CodeExample.tsx @@ -0,0 +1,44 @@ +import React from 'react'; +import CodeBlock from '@theme/CodeBlock'; + +interface CodeExampleProps { + filePath: string; + language?: string; + title?: string; +} + +const CodeExample: React.FC = ({ filePath, language, title }) => { + const [content, setContent] = React.useState(''); + const [error, setError] = React.useState(null); + + React.useEffect(() => { + // Adjust the import path to start from the docs directory + import(`!!raw-loader!/docs/${filePath}`) + .then((module) => { + const lines = module.default.split('\n'); + const mainIndex = lines.findIndex(line => line.trim().startsWith('if __name__ == ')); + const strippedContent = mainIndex !== -1 ? lines.slice(0, mainIndex).join('\n') : module.default; + setContent(strippedContent); + setError(null); + }) + .catch((error) => { + console.error(`Error loading file: ${filePath}`, error); + setError(`Failed to load file: ${filePath}. Please check if the file exists and the path is correct.`); + }); + }, [filePath]); + + if (error) { + return
{error}
; + } + + return ( + + {content || 'Loading...'} + + ); +}; + +export default CodeExample; \ No newline at end of file diff --git a/docs/docs-next/src/styles/custom.scss b/docs/docs-next/src/styles/custom.scss index dfddbc1be8f52..2e13b1cef3797 100644 --- a/docs/docs-next/src/styles/custom.scss +++ b/docs/docs-next/src/styles/custom.scss @@ -31,7 +31,7 @@ --ifm-color-primary-lightest: var(--dagster-color-gray-50); // modified base vars - --ifm-code-font-size: 90%; + --ifm-code-font-size: 80%; --ifm-navbar-height: 70px; // brand-specific colors diff --git a/docs/docs-next/src/theme/MDXComponents.tsx b/docs/docs-next/src/theme/MDXComponents.tsx index 1c3d86433c2b7..37c4ed3b0c833 100644 --- a/docs/docs-next/src/theme/MDXComponents.tsx +++ b/docs/docs-next/src/theme/MDXComponents.tsx @@ -1,6 +1,7 @@ // Import the original mapper import MDXComponents from "@theme-original/MDXComponents"; import { PyObject } from "../components/PyObject"; +import CodeExample from "../components/CodeExample"; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; export default { @@ -9,4 +10,5 @@ export default { PyObject, Tabs, TabItem, + CodeExample, }; \ No newline at end of file diff --git a/docs/vale/styles/config/vocabularies/Dagster/accept.txt b/docs/vale/styles/config/vocabularies/Dagster/accept.txt index 99cc5969713db..30f78495fa63a 100644 --- a/docs/vale/styles/config/vocabularies/Dagster/accept.txt +++ b/docs/vale/styles/config/vocabularies/Dagster/accept.txt @@ -7,3 +7,6 @@ SDA Extract Transform Load +dataframe +dataframes +DataFrame \ No newline at end of file diff --git a/pyright/master/exclude.txt b/pyright/master/exclude.txt index 2b9e39f028595..b603217067139 100644 --- a/pyright/master/exclude.txt +++ b/pyright/master/exclude.txt @@ -13,3 +13,4 @@ examples/project_dagster_university_start examples/project_du_dbt_starter examples/tutorial helm +docs/docs-next